# To help with reading and manipulating data
import pandas as pd
import numpy as np
# To help with data visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# To be used for missing value imputation
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
# To help with model building
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (
AdaBoostClassifier,
GradientBoostingClassifier,
RandomForestClassifier,
BaggingClassifier,
)
from xgboost import XGBClassifier
# To get different metric scores, and split data
from sklearn import metrics
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.metrics import (
f1_score,
accuracy_score,
recall_score,
precision_score,
confusion_matrix,
roc_auc_score,
plot_confusion_matrix,
)
# To be used for data scaling and one hot encoding
from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder
# To be used for tuning the model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
# To be used for creating pipelines and personalizing them
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
# To define maximum number of columns to be displayed in a dataframe
pd.set_option("display.max_columns", None)
# To suppress scientific notation for a dataframe
pd.set_option("display.float_format", lambda x: "%.3f" % x)
# To impute missing values
from sklearn.impute import KNNImputer
# To build a logistic regression model
from sklearn.linear_model import LogisticRegression
# To oversample and undersample data
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
# To suppress warnings
import warnings
warnings.filterwarnings("ignore")
# This will help in making the Python code more structured automatically (good coding practice)
%load_ext nb_black
# Loading the dataset
data = pd.read_csv("BankChurners.csv")
df1 = data.copy()
# let's view the first 5 rows of the data
df1.head()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 768805383 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 818770008 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 713982108 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 769911858 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 709106358 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# let's view the last 5 rows of the data
df1.tail()
| CLIENTNUM | Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 772366833 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 710638233 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 716506083 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 717406983 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 714337233 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# let's check the data types of the columns in the dataset
df1.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 10127 entries, 0 to 10126 Data columns (total 21 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 CLIENTNUM 10127 non-null int64 1 Attrition_Flag 10127 non-null object 2 Customer_Age 10127 non-null int64 3 Gender 10127 non-null object 4 Dependent_count 10127 non-null int64 5 Education_Level 8608 non-null object 6 Marital_Status 9378 non-null object 7 Income_Category 10127 non-null object 8 Card_Category 10127 non-null object 9 Months_on_book 10127 non-null int64 10 Total_Relationship_Count 10127 non-null int64 11 Months_Inactive_12_mon 10127 non-null int64 12 Contacts_Count_12_mon 10127 non-null int64 13 Credit_Limit 10127 non-null float64 14 Total_Revolving_Bal 10127 non-null int64 15 Avg_Open_To_Buy 10127 non-null float64 16 Total_Amt_Chng_Q4_Q1 10127 non-null float64 17 Total_Trans_Amt 10127 non-null int64 18 Total_Trans_Ct 10127 non-null int64 19 Total_Ct_Chng_Q4_Q1 10127 non-null float64 20 Avg_Utilization_Ratio 10127 non-null float64 dtypes: float64(5), int64(10), object(6) memory usage: 1.6+ MB
# let's check for duplicate values in the data
df1.duplicated().sum()
0
# let's check for missing values in the data
round(df1.isnull().sum() / df1.isnull().count() * 100, 2)
CLIENTNUM 0.000 Attrition_Flag 0.000 Customer_Age 0.000 Gender 0.000 Dependent_count 0.000 Education_Level 15.000 Marital_Status 7.400 Income_Category 0.000 Card_Category 0.000 Months_on_book 0.000 Total_Relationship_Count 0.000 Months_Inactive_12_mon 0.000 Contacts_Count_12_mon 0.000 Credit_Limit 0.000 Total_Revolving_Bal 0.000 Avg_Open_To_Buy 0.000 Total_Amt_Chng_Q4_Q1 0.000 Total_Trans_Amt 0.000 Total_Trans_Ct 0.000 Total_Ct_Chng_Q4_Q1 0.000 Avg_Utilization_Ratio 0.000 dtype: float64
# Checking for the null value in the dataset
df1.isna().sum()
CLIENTNUM 0 Attrition_Flag 0 Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 1519 Marital_Status 749 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
df1.nunique()
CLIENTNUM 10127 Attrition_Flag 2 Customer_Age 45 Gender 2 Dependent_count 6 Education_Level 6 Marital_Status 3 Income_Category 6 Card_Category 4 Months_on_book 44 Total_Relationship_Count 6 Months_Inactive_12_mon 7 Contacts_Count_12_mon 7 Credit_Limit 6205 Total_Revolving_Bal 1974 Avg_Open_To_Buy 6813 Total_Amt_Chng_Q4_Q1 1158 Total_Trans_Amt 5033 Total_Trans_Ct 126 Total_Ct_Chng_Q4_Q1 830 Avg_Utilization_Ratio 964 dtype: int64
# let's view the statistical summary of the numerical columns in the data
df1.describe().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| CLIENTNUM | 10127.000 | 739177606.334 | 36903783.450 | 708082083.000 | 713036770.500 | 717926358.000 | 773143533.000 | 828343083.000 |
| Customer_Age | 10127.000 | 46.326 | 8.017 | 26.000 | 41.000 | 46.000 | 52.000 | 73.000 |
| Dependent_count | 10127.000 | 2.346 | 1.299 | 0.000 | 1.000 | 2.000 | 3.000 | 5.000 |
| Months_on_book | 10127.000 | 35.928 | 7.986 | 13.000 | 31.000 | 36.000 | 40.000 | 56.000 |
| Total_Relationship_Count | 10127.000 | 3.813 | 1.554 | 1.000 | 3.000 | 4.000 | 5.000 | 6.000 |
| Months_Inactive_12_mon | 10127.000 | 2.341 | 1.011 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Contacts_Count_12_mon | 10127.000 | 2.455 | 1.106 | 0.000 | 2.000 | 2.000 | 3.000 | 6.000 |
| Credit_Limit | 10127.000 | 8631.954 | 9088.777 | 1438.300 | 2555.000 | 4549.000 | 11067.500 | 34516.000 |
| Total_Revolving_Bal | 10127.000 | 1162.814 | 814.987 | 0.000 | 359.000 | 1276.000 | 1784.000 | 2517.000 |
| Avg_Open_To_Buy | 10127.000 | 7469.140 | 9090.685 | 3.000 | 1324.500 | 3474.000 | 9859.000 | 34516.000 |
| Total_Amt_Chng_Q4_Q1 | 10127.000 | 0.760 | 0.219 | 0.000 | 0.631 | 0.736 | 0.859 | 3.397 |
| Total_Trans_Amt | 10127.000 | 4404.086 | 3397.129 | 510.000 | 2155.500 | 3899.000 | 4741.000 | 18484.000 |
| Total_Trans_Ct | 10127.000 | 64.859 | 23.473 | 10.000 | 45.000 | 67.000 | 81.000 | 139.000 |
| Total_Ct_Chng_Q4_Q1 | 10127.000 | 0.712 | 0.238 | 0.000 | 0.582 | 0.702 | 0.818 | 3.714 |
| Avg_Utilization_Ratio | 10127.000 | 0.275 | 0.276 | 0.000 | 0.023 | 0.176 | 0.503 | 0.999 |
# Making a list of all categorical variables
cat_col = [
"Attrition_Flag",
"Gender",
"Education_Level",
"Marital_Status",
"Income_Category",
"Card_Category",
]
# Printing number of count of each unique value in each column
for column in cat_col:
print(df1[column].value_counts())
print("-" * 40)
Existing Customer 8500 Attrited Customer 1627 Name: Attrition_Flag, dtype: int64 ---------------------------------------- F 5358 M 4769 Name: Gender, dtype: int64 ---------------------------------------- Graduate 3128 High School 2013 Uneducated 1487 College 1013 Post-Graduate 516 Doctorate 451 Name: Education_Level, dtype: int64 ---------------------------------------- Married 4687 Single 3943 Divorced 748 Name: Marital_Status, dtype: int64 ---------------------------------------- Less than $40K 3561 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 abc 1112 $120K + 727 Name: Income_Category, dtype: int64 ---------------------------------------- Blue 9436 Silver 555 Gold 116 Platinum 20 Name: Card_Category, dtype: int64 ----------------------------------------
# let's view the statistical summary of the non-numerical columns in the data
df1.describe(exclude=np.number).T
| count | unique | top | freq | |
|---|---|---|---|---|
| Attrition_Flag | 10127 | 2 | Existing Customer | 8500 |
| Gender | 10127 | 2 | F | 5358 |
| Education_Level | 8608 | 6 | Graduate | 3128 |
| Marital_Status | 9378 | 3 | Married | 4687 |
| Income_Category | 10127 | 6 | Less than $40K | 3561 |
| Card_Category | 10127 | 4 | Blue | 9436 |
# this loop prints the names of the columns where there is
# at least one entry ending with the characters 'abc'
Income_Category_cols = []
for colname in df1.columns[
df1.dtypes == "object"
]: # only need to consider string columns
if (
df1[colname].str.endswith("abc").any()
): # using `.str` so I can use an element-wise string method
Income_Category_cols.append(colname)
print(Income_Category_cols)
['Income_Category']
# this loop determines which columns start with 'abc'
Income_Category_cols_null = []
for colname in df1.columns[
df1.dtypes == "object"
]: # only need to consider string columns
if (
df1[colname].str.startswith("abc").any()
): # using `.str` so I can use an element-wise string method
Income_Category_cols_null.append(colname)
print(Income_Category_cols_null)
['Income_Category']
# replacing "abc" value count with "Less than $40K" in "Income_Category"
def Income_Category_to_str(Income_Category_val):
    """Replace the placeholder 'abc' with 'Less than $40K' in a category value.

    Non-string values (e.g. NaN) are returned unchanged — the original
    implicitly returned None for them, silently converting NaN into None
    and corrupting downstream missing-value handling.
    """
    if isinstance(Income_Category_val, str):
        return Income_Category_val.replace("abc", "Less than $40K")
    return Income_Category_val  # preserve NaN / non-string entries as-is
for colname in Income_Category_cols:
df1[colname] = df1[colname].apply(Income_Category_to_str)
df1[Income_Category_cols].head()
| Income_Category | |
|---|---|
| 0 | $60K - $80K |
| 1 | Less than $40K |
| 2 | $80K - $120K |
| 3 | Less than $40K |
| 4 | $60K - $80K |
# confirming value counts in "Income_Category"
df1["Income_Category"].value_counts()
Less than $40K 4673 $40K - $60K 1790 $80K - $120K 1535 $60K - $80K 1402 $120K + 727 Name: Income_Category, dtype: int64
# function to plot a boxplot and a histogram along the same scale.
def histogram_boxplot(data, feature, figsize=(12, 7), kde=False, bins=None):
    """
    Boxplot and histogram combined on a shared x-axis.

    data: dataframe
    feature: dataframe column
    figsize: size of figure (default (12,7))
    kde: whether to show the density curve (default False)
    bins: number of bins for histogram (default None, i.e. seaborn's default binning)
    """
    f2, (ax_box2, ax_hist2) = plt.subplots(
        nrows=2,  # boxplot on top, histogram below
        sharex=True,  # x-axis shared so both views line up
        gridspec_kw={"height_ratios": (0.25, 0.75)},
        figsize=figsize,
    )
    # boxplot; showmeans=True marks the column mean with a star
    sns.boxplot(data=data, x=feature, ax=ax_box2, showmeans=True, color="violet")
    # Histogram: pass `bins` only when the caller supplied it so seaborn's
    # default binning applies otherwise. The original used a conditional
    # expression as a statement and passed palette="winter" — without `hue`
    # a palette is ignored by seaborn and only triggers a warning, so it
    # has been dropped.
    hist_kwargs = {"data": data, "x": feature, "kde": kde, "ax": ax_hist2}
    if bins:
        hist_kwargs["bins"] = bins
    sns.histplot(**hist_kwargs)
    ax_hist2.axvline(data[feature].mean(), color="green", linestyle="--")  # mean
    ax_hist2.axvline(data[feature].median(), color="black", linestyle="-")  # median
# Observations on Customer_age
histogram_boxplot(df1, "Customer_Age")
# Observations on Dependent_count
histogram_boxplot(df1, "Dependent_count")
# Observations on Months_on_book
histogram_boxplot(df1, "Months_on_book")
# Observations on Total_Relationship_Count
histogram_boxplot(df1, "Total_Relationship_Count")
# Observations on Months_Inactive_12_mon
histogram_boxplot(df1, "Months_Inactive_12_mon")
# Observations on Contacts_Count_12_mon
histogram_boxplot(df1, "Contacts_Count_12_mon")
# Observations on Credit_Limit
histogram_boxplot(df1, "Credit_Limit")
# Observations on Total_Revolving_Bal
histogram_boxplot(df1, "Total_Revolving_Bal")
# Observations on Avg_Open_To_Buy
histogram_boxplot(df1, "Avg_Open_To_Buy")
# Observations on Total_Amt_Chng_Q4_Q1
histogram_boxplot(df1, "Total_Amt_Chng_Q4_Q1")
# Observations on Total_Trans_Amt
histogram_boxplot(df1, "Total_Trans_Amt")
# Observations on Total_Trans_Ct
histogram_boxplot(df1, "Total_Trans_Ct")
# Observations on Total_Ct_Chng_Q4_Q1
histogram_boxplot(df1, "Total_Ct_Chng_Q4_Q1")
# Observations on Avg_Utilization_Ratio
histogram_boxplot(df1, "Avg_Utilization_Ratio")
# function to create labeled barplots
def labeled_barplot(data, feature, perc=False, n=None):
    """
    Barplot with the count (or percentage) annotated above each bar.

    data: dataframe
    feature: dataframe column
    perc: whether to display percentages instead of count (default is False)
    n: displays the top n category levels (default is None, i.e., display all levels)
    """
    total = len(data[feature])  # number of rows; denominator for percentages
    count = data[feature].nunique()
    # scale figure width with the number of bars actually drawn
    plt.figure(figsize=((count if n is None else n) + 1, 5))
    plt.xticks(rotation=90, fontsize=15)
    ax = sns.countplot(
        data=data,
        x=feature,
        palette="Paired",
        # keep only the n most frequent levels, then sort them for display
        order=data[feature].value_counts().index[:n].sort_values(),
    )
    for p in ax.patches:
        if perc:  # idiomatic truthiness check (was `perc == True`)
            # percentage of each class of the category
            label = "{:.1f}%".format(100 * p.get_height() / total)
        else:
            label = p.get_height()  # raw count of the category level
        x = p.get_x() + p.get_width() / 2  # horizontal centre of the bar
        y = p.get_height()  # top of the bar
        ax.annotate(
            label,
            (x, y),
            ha="center",
            va="center",
            size=12,
            xytext=(0, 5),
            textcoords="offset points",
        )  # place the label just above the bar
    plt.show()  # show the plot
# observations on Attrition_Flag
labeled_barplot(df1, "Attrition_Flag")
# observations on Gender
labeled_barplot(df1, "Gender")
# observations on Education_Level
labeled_barplot(df1, "Education_Level")
# observations on Marital_Status
labeled_barplot(df1, "Marital_Status")
# observations on Income_Category
labeled_barplot(df1, "Income_Category")
# observations on Card_Category
labeled_barplot(df1, "Card_Category")
sns.pairplot(df1, hue="Attrition_Flag")
<seaborn.axisgrid.PairGrid at 0x7f875829be50>
# boxplot comparing Customer Age and Attrition Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Customer_Age", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Customer_Age'>
# boxplot comparing Dependent_Count and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Dependent_count", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Dependent_count'>
# boxplot comparing Months_on_book and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Months_on_book", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Months_on_book'>
# boxplot comparing Total_Relationship_Count and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(
x="Attrition_Flag", y="Total_Relationship_Count", data=df1, orient="vertical"
)
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Relationship_Count'>
# boxplot comparing Months_Inactive_12_mon and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Months_Inactive_12_mon", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Months_Inactive_12_mon'>
# boxplot comparing Contacts_Count_12_mon and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Contacts_Count_12_mon", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Contacts_Count_12_mon'>
# boxplot comparing Credit_Limit and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Credit_Limit", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Credit_Limit'>
# boxplot comparing Total_Revolving_Bal and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Revolving_Bal", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Revolving_Bal'>
# boxplot comparing Avg_Open_To_Buy and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Avg_Open_To_Buy", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Avg_Open_To_Buy'>
# boxplot comparing Total_Amt_Chng_Q4_Q1 and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Amt_Chng_Q4_Q1", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Amt_Chng_Q4_Q1'>
# boxplot comparing Total_Trans_Amt and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Amt", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Trans_Amt'>
# boxplot comparing Total_Trans_Ct and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Trans_Ct", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Trans_Ct'>
# boxplot comparing Total_Ct_Chng_Q4_Q1 and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Total_Ct_Chng_Q4_Q1", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Total_Ct_Chng_Q4_Q1'>
# boxplot comparing Avg_Utilization_Ratio and Attrition_Flag
sns.set(rc={"figure.figsize": (10, 7)})
sns.boxplot(x="Attrition_Flag", y="Avg_Utilization_Ratio", data=df1, orient="vertical")
<AxesSubplot:xlabel='Attrition_Flag', ylabel='Avg_Utilization_Ratio'>
# function to plot stacked bar chart
def stacked_barplot(data, predictor, target):
    """
    Print the category counts and plot a stacked bar chart.

    data: dataframe
    predictor: independent variable
    target: target variable
    """
    count = data[predictor].nunique()
    # sort rows by the least frequent target class (last entry of value_counts)
    sorter = data[target].value_counts().index[-1]
    # counts table, with row/column totals ("All" margins)
    tab1 = pd.crosstab(data[predictor], data[target], margins=True).sort_values(
        by=sorter, ascending=False
    )
    print(tab1)
    print("-" * 120)
    # row-normalized proportions for the stacked bars
    tab = pd.crosstab(data[predictor], data[target], normalize="index").sort_values(
        by=sorter, ascending=False
    )
    tab.plot(kind="bar", stacked=True, figsize=(count + 1, 5))
    # Single legend call — the original issued plt.legend(loc="lower left")
    # followed immediately by this call, so the first had no effect.
    plt.legend(loc="upper left", bbox_to_anchor=(1, 1))
    plt.show()
# stacked barplot of Gender and Attrition_Flag
stacked_barplot(df1, "Gender", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Gender All 1627 8500 10127 F 930 4428 5358 M 697 4072 4769 ------------------------------------------------------------------------------------------------------------------------
# stacked barplot of Education_Level and Attrition_Flag
stacked_barplot(df1, "Education_Level", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Education_Level All 1371 7237 8608 Graduate 487 2641 3128 High School 306 1707 2013 Uneducated 237 1250 1487 College 154 859 1013 Doctorate 95 356 451 Post-Graduate 92 424 516 ------------------------------------------------------------------------------------------------------------------------
# stacked barplot of Marital_Status and Attrition_Flag
stacked_barplot(df1, "Marital_Status", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Marital_Status All 1498 7880 9378 Married 709 3978 4687 Single 668 3275 3943 Divorced 121 627 748 ------------------------------------------------------------------------------------------------------------------------
# stacked barplot of Income_Category and Attrition_Flag
stacked_barplot(df1, "Income_Category", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Income_Category All 1627 8500 10127 Less than $40K 799 3874 4673 $40K - $60K 271 1519 1790 $80K - $120K 242 1293 1535 $60K - $80K 189 1213 1402 $120K + 126 601 727 ------------------------------------------------------------------------------------------------------------------------
# stacked barplot of Card_Category and Attrition_Flag
stacked_barplot(df1, "Card_Category", "Attrition_Flag")
Attrition_Flag Attrited Customer Existing Customer All Card_Category All 1627 8500 10127 Blue 1519 7917 9436 Silver 82 473 555 Gold 21 95 116 Platinum 5 15 20 ------------------------------------------------------------------------------------------------------------------------
plt.figure(figsize=(15, 7))
sns.heatmap(df1.corr(), annot=True, vmin=-1, vmax=1, fmt=".2f", cmap="Spectral")
plt.show()
# shape of data
df1.shape
(10127, 21)
# dropping column
df1.drop("CLIENTNUM", axis=1, inplace=True)
df1.shape
(10127, 20)
# Copy of data
df2 = df1.copy()
# assign encoder
attrition_encoder = LabelEncoder()
# encode Attrition flag
attrition_encoder.fit(df2["Attrition_Flag"])
LabelEncoder()
# transform attrition flag
attrition_values = attrition_encoder.transform(df2["Attrition_Flag"])
print(attrition_values)
[1 1 1 ... 0 0 0]
# creating new column
Existing_Customer = pd.DataFrame(attrition_values, columns=["Existing_Customer"])
# concat data set
df_categorical_encoded = pd.concat([Existing_Customer], axis=1)
# confirm new dataset tail
df_categorical_encoded.tail()
| Existing_Customer | |
|---|---|
| 10122 | 1 |
| 10123 | 0 |
| 10124 | 0 |
| 10125 | 0 |
| 10126 | 0 |
df2.tail()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | Existing Customer | 50 | M | 2 | Graduate | Single | $40K - $60K | Blue | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | Attrited Customer | 41 | M | 2 | NaN | Divorced | $40K - $60K | Blue | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | Attrited Customer | 44 | F | 1 | High School | Married | Less than $40K | Blue | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | Attrited Customer | 30 | M | 2 | Graduate | NaN | $40K - $60K | Blue | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | Attrited Customer | 43 | F | 2 | Graduate | Married | Less than $40K | Silver | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
df3 = pd.concat([df2, df_categorical_encoded], axis=1)
df3.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Existing_Customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Existing Customer | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 1 |
| 1 | Existing Customer | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 1 |
| 2 | Existing Customer | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 1 |
| 3 | Existing Customer | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 1 |
| 4 | Existing Customer | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 | 1 |
df3.drop("Attrition_Flag", axis=1, inplace=True)
df3.shape
(10127, 20)
df3.head()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Existing_Customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 1 |
| 1 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 1 |
| 2 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 1 |
| 3 | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 1 |
| 4 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 | 1 |
X = df3.drop(["Existing_Customer"], axis=1)
y = df3["Existing_Customer"]
# Splitting data into training, validation and test sets:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
X_train, X_val, y_train, y_val = train_test_split(
X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
# Let's impute the missing values.
# BUG FIX: the original used ONE SimpleImputer, fit it on Education_Level,
# then re-fit it on Marital_Status. The later transform() calls on the
# validation/test Education_Level column therefore used the Marital_Status
# mode learned from train — filling education NaNs with a marital status.
# Use one imputer per column, each fit on train data only.
imp_mode_edu = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
imp_mode_mar = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
# fit each imputer on train data and transform the train data
X_train["Education_Level"] = imp_mode_edu.fit_transform(X_train[["Education_Level"]])
X_train["Marital_Status"] = imp_mode_mar.fit_transform(X_train[["Marital_Status"]])
# transform the validation and test data using the imputers fit on train data
X_val["Education_Level"] = imp_mode_edu.transform(X_val[["Education_Level"]])
X_val["Marital_Status"] = imp_mode_mar.transform(X_val[["Marital_Status"]])
X_test["Education_Level"] = imp_mode_edu.transform(X_test[["Education_Level"]])
X_test["Marital_Status"] = imp_mode_mar.transform(X_test[["Marital_Status"]])
# defining a list with names of columns that will be used for imputation
reqd_col_for_impute = [
"Education_Level",
"Marital_Status",
]
df2[reqd_col_for_impute].head()
| Education_Level | Marital_Status | |
|---|---|---|
| 0 | High School | Married |
| 1 | Graduate | Single |
| 2 | Graduate | Married |
| 3 | High School | NaN |
| 4 | Uneducated | Married |
# Creating dummy variables for categorical variables
X_train = pd.get_dummies(data=X_train, drop_first=True)
X_val = pd.get_dummies(data=X_val, drop_first=True)
X_test = pd.get_dummies(data=X_test, drop_first=True)
X_train.head()
| Customer_Age | Dependent_count | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Gender_M | Education_Level_Doctorate | Education_Level_Graduate | Education_Level_High School | Education_Level_Post-Graduate | Education_Level_Uneducated | Marital_Status_Married | Marital_Status_Single | Income_Category_$40K - $60K | Income_Category_$60K - $80K | Income_Category_$80K - $120K | Income_Category_Less than $40K | Card_Category_Gold | Card_Category_Platinum | Card_Category_Silver | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9501 | 47 | 2 | 37 | 1 | 2 | 2 | 21714.000 | 1969 | 19745.000 | 0.944 | 13270 | 104 | 0.625 | 0.091 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 5065 | 49 | 4 | 42 | 5 | 1 | 4 | 7789.000 | 957 | 6832.000 | 0.724 | 3412 | 70 | 0.842 | 0.123 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2375 | 53 | 2 | 36 | 6 | 1 | 3 | 3176.000 | 1470 | 1706.000 | 0.388 | 1634 | 53 | 0.472 | 0.463 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 7579 | 56 | 2 | 45 | 3 | 3 | 1 | 3296.000 | 1435 | 1861.000 | 0.968 | 4327 | 66 | 0.737 | 0.435 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
| 2776 | 47 | 4 | 36 | 3 | 3 | 3 | 17557.000 | 0 | 17557.000 | 0.667 | 2142 | 62 | 0.378 | 0.000 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
models = []  # Empty list to store all the models
# Appending models into the list as (display name, untrained estimator)
# pairs; every estimator is seeded with random_state=1 for reproducibility
models.append(("Bagging", BaggingClassifier(random_state=1)))
models.append(("Random forest", RandomForestClassifier(random_state=1)))
models.append(("GBM", GradientBoostingClassifier(random_state=1)))
models.append(("Adaboost", AdaBoostClassifier(random_state=1)))
models.append(("Xgboost", XGBClassifier(random_state=1, eval_metric="logloss")))
models.append(("dtree", DecisionTreeClassifier(random_state=1)))
results = []  # Empty list to store all model's CV scores
names = []  # Empty list to store name of the models
# loop through all models to get the mean cross validated score
print("\n" "Cross-Validation Performance:" "\n")
for name, model in models:
    # recall is the chosen metric: the business cost of missing an attriting
    # customer (false negative) outweighs a false alarm
    scoring = "recall"
    kfold = StratifiedKFold(
        n_splits=5, shuffle=True, random_state=1
    )  # Setting number of splits equal to 5; stratified to keep class balance per fold
    cv_result = cross_val_score(
        estimator=model, X=X_train, y=y_train, scoring=scoring, cv=kfold
    )
    results.append(cv_result)
    names.append(name)
    print("{}: {}".format(name, cv_result.mean() * 100))
print("\n" "Training Performance:" "\n")
# refit each model on the full training set and report training recall;
# comparing this with the CV score above exposes overfitting (e.g. 100%
# training recall with a lower CV recall)
for name, model in models:
    model.fit(X_train, y_train)
    scores = recall_score(y_train, model.predict(X_train)) * 100
    print("{}: {}".format(name, scores))
Cross-Validation Performance: Bagging: 97.31355231003099 Random forest: 98.52925782678135 GBM: 98.7841405453251 Adaboost: 97.92122226712078 Xgboost: 98.62718158902423 dtree: 96.37199099463143 Training Performance: Bagging: 99.76465973720337 Random forest: 100.0 GBM: 99.21553245734458 Adaboost: 98.25455971759169 Xgboost: 100.0 dtree: 100.0
# KNN imputation works on numeric input only, so label-encode every
# categorical column first; the forward mappings are kept as module-level
# dicts so the encoding can be inverted after imputation.
imputer = KNNImputer(n_neighbors=5)
Attrition_Flag = {"Existing Customer": 0, "Attrited Customer": 1}
Gender = {"M": 0, "F": 1}
Education_Level = {
    "Graduate": 0,
    "High School": 1,
    "Uneducated": 2,
    "Post-Graduate": 3,
    "Doctorate": 4,
}
Marital_Status = {"Married": 0, "Single": 1, "Divorced": 2}
Income_Category = {
    "Less than $40K": 0,
    "$40K - $60K": 1,
    "$80K - $120K": 2,
    "$60K - $80K": 3,
    "$120K +": 4,
}
Card_Category = {"Blue": 0, "Silver": 1, "Gold": 2, "Platinum": 3}
# Apply each mapping in turn; values absent from a mapping become NaN,
# which is exactly what the KNN imputer expects to fill in.
for _column, _mapping in [
    ("Attrition_Flag", Attrition_Flag),
    ("Gender", Gender),
    ("Education_Level", Education_Level),
    ("Marital_Status", Marital_Status),
    ("Income_Category", Income_Category),
    ("Card_Category", Card_Category),
]:
    df2[_column] = df2[_column].map(_mapping)
df2.head()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 45 | 0 | 3 | 1.000 | 0.000 | 3 | 0 | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 |
| 1 | 0 | 49 | 1 | 5 | 0.000 | 1.000 | 0 | 0 | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 |
| 2 | 0 | 51 | 0 | 3 | 0.000 | 0.000 | 2 | 0 | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 |
| 3 | 0 | 40 | 1 | 4 | 1.000 | NaN | 0 | 0 | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 |
| 4 | 0 | 40 | 0 | 3 | 2.000 | 0.000 | 3 | 0 | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 |
# Inspect the last five rows to confirm the encoding applied across the frame
df2.tail()
| Attrition_Flag | Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 10122 | 0 | 50 | 0 | 2 | 0.000 | 1.000 | 1 | 0 | 40 | 3 | 2 | 3 | 4003.000 | 1851 | 2152.000 | 0.703 | 15476 | 117 | 0.857 | 0.462 |
| 10123 | 1 | 41 | 0 | 2 | NaN | 2.000 | 1 | 0 | 25 | 4 | 2 | 3 | 4277.000 | 2186 | 2091.000 | 0.804 | 8764 | 69 | 0.683 | 0.511 |
| 10124 | 1 | 44 | 1 | 1 | 1.000 | 0.000 | 0 | 0 | 36 | 5 | 3 | 4 | 5409.000 | 0 | 5409.000 | 0.819 | 10291 | 60 | 0.818 | 0.000 |
| 10125 | 1 | 30 | 0 | 2 | 0.000 | NaN | 1 | 0 | 36 | 4 | 3 | 3 | 5281.000 | 0 | 5281.000 | 0.535 | 8395 | 62 | 0.722 | 0.000 |
| 10126 | 1 | 43 | 1 | 2 | 0.000 | 0.000 | 0 | 1 | 25 | 6 | 2 | 4 | 10388.000 | 1961 | 8427.000 | 0.703 | 10294 | 61 | 0.649 | 0.189 |
# Separate the predictors from the target
X = df2.drop(["Attrition_Flag"], axis=1)
y = df2["Attrition_Flag"]
# Splitting data into training, validation and test set:
# first we split data into 2 parts, say temporary and test
X_temp, X_test, y_temp, y_test = train_test_split(
    X, y, test_size=0.2, random_state=1, stratify=y
)
# then we split the temporary set into train and validation
# (0.25 of the remaining 80% gives a 60/20/20 overall split; stratify
# keeps the attrition ratio identical in every split)
X_train, X_val, y_train, y_val = train_test_split(
    X_temp, y_temp, test_size=0.25, random_state=1, stratify=y_temp
)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 19) (2026, 19) (2026, 19)
# Report the row count of each split (expected 60/20/20 of the full data)
print("Number of rows in train data =", X_train.shape[0])
print("Number of rows in validation data =", X_val.shape[0])
print("Number of rows in test data =", X_test.shape[0])
Number of rows in train data = 6075 Number of rows in validation data = 2026 Number of rows in test data = 2026
# Fit the KNN imputer on the training data ONLY, then apply the fitted
# imputer to the validation and test sets. The original code called
# fit_transform on the validation set, which refits the imputer on
# validation data and leaks its distribution into the imputation model.
X_train[reqd_col_for_impute] = imputer.fit_transform(X_train[reqd_col_for_impute])
# Transform the validation data (no refit -- avoids data leakage)
X_val[reqd_col_for_impute] = imputer.transform(X_val[reqd_col_for_impute])
# Transform the test data
X_test[reqd_col_for_impute] = imputer.transform(X_test[reqd_col_for_impute])
# Checking that no column has missing values in train, validation or test sets
print(X_train.isna().sum())
print("-" * 30)
print(X_val.isna().sum())
print("-" * 30)
print(X_test.isna().sum())
Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64 ------------------------------ Customer_Age 0 Gender 0 Dependent_count 0 Education_Level 0 Marital_Status 0 Income_Category 0 Card_Category 0 Months_on_book 0 Total_Relationship_Count 0 Months_Inactive_12_mon 0 Contacts_Count_12_mon 0 Credit_Limit 0 Total_Revolving_Bal 0 Avg_Open_To_Buy 0 Total_Amt_Chng_Q4_Q1 0 Total_Trans_Amt 0 Total_Trans_Ct 0 Total_Ct_Chng_Q4_Q1 0 Avg_Utilization_Ratio 0 dtype: int64
## Function to inverse the encoding
def inverse_mapping(x, y):
    """Map the KNN-imputed numeric codes in column `y` of the train,
    validation and test frames back to their original string labels.

    x: forward mapping dict {label: code} used for the encoding
    y: name of the column to decode

    Imputed values are rounded to the nearest integer code before the
    reverse lookup, and the decoded column is stored as `category` dtype.
    Mutates the module-level X_train / X_val / X_test frames in place.
    """
    inv_dict = {code: label for label, code in x.items()}
    for frame in (X_train, X_val, X_test):
        frame[y] = np.round(frame[y]).map(inv_dict).astype("category")
# Restore the original string labels for every encoded categorical column
# (Attrition_Flag stays numeric -- it is the modelling target)
inverse_mapping(Gender, "Gender")
inverse_mapping(Education_Level, "Education_Level")
inverse_mapping(Marital_Status, "Marital_Status")
inverse_mapping(Income_Category, "Income_Category")
inverse_mapping(Card_Category, "Card_Category")
# Category frequencies for every categorical column of the train set
cols = X_train.select_dtypes(include=["object", "category"])
for column_name in cols.columns:
    print(X_train[column_name].value_counts())
    print("*" * 30)
F 3193 M 2882 Name: Gender, dtype: int64 ****************************** High School 2774 Graduate 1854 Uneducated 881 Post-Graduate 312 Doctorate 254 Name: Education_Level, dtype: int64 ****************************** Married 2985 Single 2660 Divorced 430 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 2783 $40K - $60K 1059 $80K - $120K 953 $60K - $80K 831 $120K + 449 Name: Income_Category, dtype: int64 ****************************** Blue 5655 Silver 339 Gold 69 Platinum 12 Name: Card_Category, dtype: int64 ******************************
# Category frequencies for every categorical column of the validation set
cols = X_val.select_dtypes(include=["object", "category"])
for column_name in cols.columns:
    print(X_val[column_name].value_counts())
    print("*" * 30)
F 1095 M 931 Name: Gender, dtype: int64 ****************************** High School 664 Graduate 623 Uneducated 539 Post-Graduate 101 Doctorate 99 Name: Education_Level, dtype: int64 ****************************** Married 976 Single 894 Divorced 156 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 957 $40K - $60K 361 $80K - $120K 293 $60K - $80K 279 $120K + 136 Name: Income_Category, dtype: int64 ****************************** Blue 1905 Silver 97 Gold 21 Platinum 3 Name: Card_Category, dtype: int64 ******************************
# Category frequencies for every categorical column of the test set
cols = X_test.select_dtypes(include=["object", "category"])
for column_name in cols.columns:
    print(X_test[column_name].value_counts())
    print("*" * 30)
F 1070 M 956 Name: Gender, dtype: int64 ****************************** Graduate 651 High School 649 Uneducated 525 Post-Graduate 103 Doctorate 98 Name: Education_Level, dtype: int64 ****************************** Single 938 Married 926 Divorced 162 Name: Marital_Status, dtype: int64 ****************************** Less than $40K 933 $40K - $60K 370 $60K - $80K 292 $80K - $120K 289 $120K + 142 Name: Income_Category, dtype: int64 ****************************** Blue 1876 Silver 119 Gold 26 Platinum 5 Name: Card_Category, dtype: int64 ******************************
# One-hot encode the categorical columns (drop_first avoids the
# dummy-variable trap / perfect multicollinearity).
X_train = pd.get_dummies(X_train, drop_first=True)
X_val = pd.get_dummies(X_val, drop_first=True)
X_test = pd.get_dummies(X_test, drop_first=True)
# Encoding each split independently can yield mismatched or misordered
# dummy columns when a split lacks some category; align validation and
# test to the training columns so the feature matrices always agree.
# (No-op when all categories appear in every split, as here.)
X_val = X_val.reindex(columns=X_train.columns, fill_value=0)
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)
print(X_train.shape, X_val.shape, X_test.shape)
(6075, 28) (2026, 28) (2026, 28)
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute accuracy, recall, precision and F1 for a fitted classifier.
    model: classifier with a .predict method
    predictors: independent variables
    target: dependent variable
    Returns a one-row DataFrame with columns Accuracy, Recall, Precision, F1.
    """
    # predict once and reuse for every metric
    pred = model.predict(predictors)
    metric_values = {
        "Accuracy": accuracy_score(target, pred),
        "Recall": recall_score(target, pred),
        "Precision": precision_score(target, pred),
        "F1": f1_score(target, pred),
    }
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix as a heatmap, annotating each cell with
    the raw count and its percentage of all predictions.
    model: classifier with a .predict method
    predictors: independent variables
    target: dependent variable
    """
    cm = confusion_matrix(target, model.predict(predictors))
    flat = cm.flatten()
    total = flat.sum()
    # "count\npercent" annotation for each of the four cells
    labels = np.asarray(
        ["{0:0.0f}".format(c) + "\n{0:.2%}".format(c / total) for c in flat]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=labels, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
# Baseline logistic regression on the original (imbalanced) training data
lr = LogisticRegression(random_state=1)
lr.fit(X_train, y_train)
LogisticRegression(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=lr, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
log_reg_model_train_perf = model_performance_classification_sklearn(
    lr, X_train, y_train
)
print("Training performance:")
log_reg_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.873 | 0.405 | 0.674 | 0.506 |
# Calculating different metrics on validation set
# (recall on the minority class is the headline metric for attrition)
log_reg_model_val_perf = model_performance_classification_sklearn(lr, X_val, y_val)
print("Validation performance:")
log_reg_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.884 | 0.482 | 0.704 | 0.572 |
# creating confusion matrix
confusion_matrix_sklearn(lr, X_val, y_val)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# sampling_strategy=1 synthesises minority samples until both classes are equal
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
# Logistic regression retrained on the SMOTE-oversampled training data
log_reg_over = LogisticRegression(random_state=1)
# Training the basic logistic regression model with training set
log_reg_over.fit(X_train_over, y_train_over)
LogisticRegression(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=log_reg_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
log_reg_over_train_perf = model_performance_classification_sklearn(
    log_reg_over, X_train_over, y_train_over
)
print("Training performance:")
log_reg_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.793 | 0.780 | 0.802 | 0.791 |
# Calculating different metrics on validation set
# (validation uses the original class distribution, unlike the train scores)
log_reg_over_val_perf = model_performance_classification_sklearn(
    log_reg_over, X_val, y_val
)
print("validation performance:")
log_reg_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.804 | 0.770 | 0.437 | 0.558 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg_over, X_val, y_val)
# Randomly drop majority-class rows until both classes match in size
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
# Logistic regression retrained on the undersampled training data
log_reg_under = LogisticRegression(random_state=1)
log_reg_under.fit(X_train_un, y_train_un)
LogisticRegression(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=log_reg_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
log_reg_under_train_perf = model_performance_classification_sklearn(
    log_reg_under, X_train_un, y_train_un
)
print("Training performance:")
log_reg_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.827 | 0.836 | 0.822 | 0.829 |
# Calculating different metrics on validation set
log_reg_under_val_perf = model_performance_classification_sklearn(
    log_reg_under, X_val, y_val
)
print("Validation performance:")
log_reg_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.823 | 0.840 | 0.472 | 0.605 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg_under, X_val, y_val)
# Calculating different metrics on train set
# NOTE(review): this scores the ORIGINAL `lr` (fit on the imbalanced train
# set) against the oversampled data; `log_reg_reg_train_perf` is then
# overwritten below by the grid-searched estimator -- confirm this
# intermediate evaluation is intentional.
log_reg_reg_train_perf = model_performance_classification_sklearn(
    lr, X_train_over, y_train_over
)
print("Training performance:")
log_reg_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.664 | 0.366 | 0.907 | 0.522 |
# Calculating different metrics on validation set
# (same `lr` model as above, so this repeats the earlier validation scores)
log_reg_reg_val_perf = model_performance_classification_sklearn(lr, X_val, y_val)
print("Validation performance:")
log_reg_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.884 | 0.482 | 0.704 | 0.572 |
# creating confusion matrix
confusion_matrix_sklearn(lr, X_val, y_val)
# Choose the type of classifier.
lr_estimator = LogisticRegression(random_state=1, solver="saga")
# Grid of parameters to choose from
# (C is the inverse regularisation strength; smaller C = stronger penalty)
parameters = {"C": np.arange(0.1, 1.1, 0.1)}
# Run the grid search, tuning for recall on the oversampled training data
grid_obj = GridSearchCV(lr_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
lr_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
lr_estimator.fit(X_train_over, y_train_over)
LogisticRegression(C=0.1, random_state=1, solver='saga')
# Calculating different metrics on train set
log_reg_reg_train_perf = model_performance_classification_sklearn(
    lr_estimator, X_train_over, y_train_over
)
print("Training performance:")
log_reg_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.698 | 0.550 | 0.781 | 0.646 |
# Calculating different metrics on validation set for the tuned estimator
log_reg_reg_val_perf = model_performance_classification_sklearn(
    lr_estimator, X_val, y_val
)
print("Validation performance:")
log_reg_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.796 | 0.595 | 0.408 | 0.484 |
# training performance comparison
# Transpose each one-row metrics frame so metrics become rows and each
# logistic-regression variant becomes a column
models_train_comp_df = pd.concat(
    [
        log_reg_model_train_perf.T,
        log_reg_over_train_perf.T,
        log_reg_reg_train_perf.T,
        log_reg_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Logistic Regression",
    "Logistic Regression with oversampled data",
    "Regularised Logistic Regression",
    "Logistic Regression with undersampled data",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Logistic Regression | Logistic Regression with oversampled data | Regularised Logistic Regression | Logistic Regression with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.873 | 0.793 | 0.698 | 0.827 |
| Recall | 0.405 | 0.780 | 0.550 | 0.836 |
| Precision | 0.674 | 0.802 | 0.781 | 0.822 |
| F1 | 0.506 | 0.791 | 0.646 | 0.829 |
# Validation performance comparison
# (reuses the models_train_comp_df name for the validation-side table)
models_train_comp_df = pd.concat(
    [
        log_reg_model_val_perf.T,
        log_reg_over_val_perf.T,
        log_reg_reg_val_perf.T,
        log_reg_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Logistic Regression",
    "Logistic Regression with oversampled data",
    "Regularised Logistic Regression",
    "Logistic Regression with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| Logistic Regression | Logistic Regression with oversampled data | Regularised Logistic Regression | Logistic Regression with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.884 | 0.804 | 0.796 | 0.823 |
| Recall | 0.482 | 0.770 | 0.595 | 0.840 |
| Precision | 0.704 | 0.437 | 0.408 | 0.472 |
| F1 | 0.572 | 0.558 | 0.484 | 0.605 |
# Calculating different metrics on the TEST set -- final check of the
# chosen logistic-regression variant (undersampled data)
log_reg_under_test_perf = model_performance_classification_sklearn(
    log_reg_under, X_test, y_test
)
print("Test performance:")
log_reg_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.812 | 0.852 | 0.454 | 0.593 |
# creating confusion matrix
confusion_matrix_sklearn(log_reg_under, X_test, y_test)
# Baseline decision tree on the original (imbalanced) training data
d_tree = DecisionTreeClassifier(random_state=1)
d_tree.fit(X_train, y_train)
DecisionTreeClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=d_tree, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
d_tree_model_train_perf = model_performance_classification_sklearn(
    d_tree, X_train, y_train
)
print("Training performance:")
d_tree_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
d_tree_model_val_perf = model_performance_classification_sklearn(d_tree, X_val, y_val)
print("Validation performance:")
d_tree_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.942 | 0.828 | 0.813 | 0.821 |
# creating confusion matrix
confusion_matrix_sklearn(d_tree, X_val, y_val)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# Re-runs SMOTE with the same seed; X_train_over / y_train_over come out
# identical to the earlier resample
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
# Decision tree retrained on the SMOTE-oversampled training data
d_tree_over = DecisionTreeClassifier(random_state=1)
# Training the d_tree_over model with training set
d_tree_over.fit(X_train_over, y_train_over)
DecisionTreeClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=d_tree_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
d_tree_over_train_perf = model_performance_classification_sklearn(
    d_tree_over, X_train_over, y_train_over
)
print("Training performance:")
d_tree_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
d_tree_over_val_perf = model_performance_classification_sklearn(
    d_tree_over, X_val, y_val
)
print("validation performance:")
d_tree_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.925 | 0.853 | 0.730 | 0.786 |
# creating confusion matrix
confusion_matrix_sklearn(d_tree_over, X_val, y_val)
# Re-runs the undersampler with the same seed; X_train_un / y_train_un
# come out identical to the earlier resample
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
# Decision tree retrained on the undersampled training data
d_tree_under = DecisionTreeClassifier(random_state=1)
d_tree_under.fit(X_train_un, y_train_un)
DecisionTreeClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=d_tree_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
d_tree_under_train_perf = model_performance_classification_sklearn(
    d_tree_under, X_train_un, y_train_un
)
print("Training performance:")
d_tree_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
d_tree_under_val_perf = model_performance_classification_sklearn(
    d_tree_under, X_val, y_val
)
print("Validation performance:")
d_tree_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.890 | 0.899 | 0.608 | 0.725 |
# creating confusion matrix
confusion_matrix_sklearn(d_tree_under, X_val, y_val)
# Calculating different metrics on train set
# NOTE(review): scores the ORIGINAL `d_tree` (fit on imbalanced data)
# against the oversampled set; this variable is overwritten by the tuned
# tree below -- confirm this intermediate evaluation is intentional.
d_tree_reg_train_perf = model_performance_classification_sklearn(
    d_tree, X_train_over, y_train_over
)
print("Training performance:")
d_tree_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.955 | 0.910 | 1.000 | 0.953 |
# Calculating different metrics on validation set
# (same `d_tree` model as above, so this repeats its earlier validation scores)
d_tree_reg_val_perf = model_performance_classification_sklearn(d_tree, X_val, y_val)
print("Validation performance:")
d_tree_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.942 | 0.828 | 0.813 | 0.821 |
# creating confusion matrix
confusion_matrix_sklearn(d_tree, X_val, y_val)
# Choose the type of classifier.
d_tree_estimator = DecisionTreeClassifier(random_state=1)
# Grid of parameters to choose from
# (all four knobs constrain tree growth to curb the overfitting seen above)
parameters = {
    "max_depth": np.arange(2, 10),
    "min_samples_leaf": [5, 7, 10, 15],
    "max_leaf_nodes": [2, 3, 5, 10, 15],
    "min_impurity_decrease": [0.0001, 0.001, 0.01, 0.1],
}
# Run the grid search, tuning for recall on the oversampled training data
grid_obj = GridSearchCV(d_tree_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
d_tree_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
d_tree_estimator.fit(X_train_over, y_train_over)
DecisionTreeClassifier(max_depth=3, max_leaf_nodes=5,
                       min_impurity_decrease=0.0001, min_samples_leaf=5,
                       random_state=1)
# Calculating different metrics on train set
d_tree_reg_train_perf = model_performance_classification_sklearn(
    d_tree_estimator, X_train_over, y_train_over
)
print("Training performance:")
d_tree_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.841 | 0.948 | 0.781 | 0.856 |
# Calculating different metrics on validation set for the tuned tree
d_tree_reg_val_perf = model_performance_classification_sklearn(
    d_tree_estimator, X_val, y_val
)
print("Validation performance:")
d_tree_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.762 | 0.908 | 0.395 | 0.551 |
# training performance comparison
# Transpose each one-row metrics frame so metrics become rows and each
# decision-tree variant becomes a column
models_train_comp_df = pd.concat(
    [
        d_tree_model_train_perf.T,
        d_tree_over_train_perf.T,
        d_tree_reg_train_perf.T,
        d_tree_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Decision Tree",
    "Decision Tree with oversampled data",
    "Regularised Decision Tree",
    "Decision Tree with undersampled data",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Decision Tree | Decision Tree with oversampled data | Regularised Decision Tree | Decision Tree with undersampled data | |
|---|---|---|---|---|
| Accuracy | 1.000 | 1.000 | 0.841 | 1.000 |
| Recall | 1.000 | 1.000 | 0.948 | 1.000 |
| Precision | 1.000 | 1.000 | 0.781 | 1.000 |
| F1 | 1.000 | 1.000 | 0.856 | 1.000 |
# Validation performance comparison
# (reuses the models_train_comp_df name for the validation-side table)
models_train_comp_df = pd.concat(
    [
        d_tree_model_val_perf.T,
        d_tree_over_val_perf.T,
        d_tree_reg_val_perf.T,
        d_tree_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Decision Tree",
    "Decision Tree with oversampled data",
    "Regularised Decision Tree",
    "Decision Tree with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| Decision Tree | Decision Tree with oversampled data | Regularised Decision Tree | Decision Tree with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.942 | 0.925 | 0.762 | 0.890 |
| Recall | 0.828 | 0.853 | 0.908 | 0.899 |
| Precision | 0.813 | 0.730 | 0.395 | 0.608 |
| F1 | 0.821 | 0.786 | 0.551 | 0.725 |
# Calculating different metrics on the TEST set -- final check of the
# chosen decision-tree variant (undersampled data)
d_tree_under_test_perf = model_performance_classification_sklearn(
    d_tree_under, X_test, y_test
)
print("Test performance:")
d_tree_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.886 | 0.923 | 0.593 | 0.722 |
# creating confusion matrix
confusion_matrix_sklearn(d_tree_under, X_test, y_test)
# Baseline random forest on the original (imbalanced) training data
rf = RandomForestClassifier(random_state=1)
rf.fit(X_train, y_train)
RandomForestClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=rf, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
rf_model_train_perf = model_performance_classification_sklearn(rf, X_train, y_train)
print("Training performance:")
rf_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
rf_model_val_perf = model_performance_classification_sklearn(rf, X_val, y_val)
print("Validation performance:")
rf_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958 | 0.801 | 0.929 | 0.860 |
# creating confusion matrix
confusion_matrix_sklearn(rf, X_val, y_val)
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
# Re-runs SMOTE with the same seed; the resample is identical to earlier ones
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
# Random forest retrained on the SMOTE-oversampled training data
rf_over = RandomForestClassifier(random_state=1)
# Training the rf_over model with training set
rf_over.fit(X_train_over, y_train_over)
RandomForestClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=rf_over, X=X_train_over, y=y_train_over, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
rf_over_train_perf = model_performance_classification_sklearn(
    rf_over, X_train_over, y_train_over
)
print("Training performance:")
rf_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
rf_over_val_perf = model_performance_classification_sklearn(rf_over, X_val, y_val)
print("validation performance:")
rf_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.953 | 0.859 | 0.851 | 0.855 |
# creating confusion matrix
confusion_matrix_sklearn(rf_over, X_val, y_val)
# ---- Random Forest: random-undersampled variant, then the "regularised" baseline ----
# Undersampling drops majority-class rows until both classes match the
# minority count (976 each, per the printed output below).
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
rf_under = RandomForestClassifier(random_state=1)
rf_under.fit(X_train_un, y_train_un)
RandomForestClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=rf_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
rf_under_train_perf = model_performance_classification_sklearn(
    rf_under, X_train_un, y_train_un
)
print("Training performance:")
rf_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on validation set
rf_under_val_perf = model_performance_classification_sklearn(rf_under, X_val, y_val)
print("Validation performance:")
rf_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.934 | 0.936 | 0.730 | 0.820 |
# creating confusion matrix
confusion_matrix_sklearn(rf_under, X_val, y_val)
# NOTE(review): the next evaluation stores into `rf_reg_train_perf` but scores
# the plain, untuned `rf` on the SMOTE-oversampled training data. The tuned
# ("regularised") estimator is only created further below and overwrites this
# variable — confirm this intermediate baseline is intentional.
# Calculating different metrics on train set
rf_reg_train_perf = model_performance_classification_sklearn(
    rf, X_train_over, y_train_over
)
print("Training performance:")
rf_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.953 | 0.906 | 1.000 | 0.950 |
# Calculating different metrics on validation set
rf_reg_val_perf = model_performance_classification_sklearn(rf, X_val, y_val)
print("Validation performance:")
rf_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.958 | 0.801 | 0.929 | 0.860 |
# creating confusion matrix
confusion_matrix_sklearn(rf, X_val, y_val)
# ---- Random Forest: hyperparameter tuning on the oversampled data ----
# GridSearchCV optimises recall; min_samples_leaf/max_features/max_samples
# act as regularisers against the full-depth default forest.
# Choose the type of classifier.
rf_estimator = RandomForestClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
    "n_estimators": [150, 200, 250],
    "min_samples_leaf": np.arange(5, 10),
    "max_features": np.arange(0.2, 0.7, 0.1),
    "max_samples": np.arange(0.3, 0.7, 0.1),
}
# Run the grid search
grid_obj = GridSearchCV(rf_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
rf_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
rf_estimator.fit(X_train_over, y_train_over)
RandomForestClassifier(max_features=0.4000000000000001,
                       max_samples=0.6000000000000001, min_samples_leaf=5,
                       n_estimators=150, random_state=1)
# Calculating different metrics on train set
rf_reg_train_perf = model_performance_classification_sklearn(
    rf_estimator, X_train_over, y_train_over
)
print("Training performance:")
rf_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.986 | 0.992 | 0.980 | 0.986 |
# Calculating different metrics on validation set
rf_reg_val_perf = model_performance_classification_sklearn(rf_estimator, X_val, y_val)
print("Validation performance:")
rf_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.949 | 0.911 | 0.801 | 0.852 |
# Training performance comparison across the four Random Forest variants.
# BUG FIX: the original assigned the concat result to `models_train_comp_rf`
# but then set the column labels on — and displayed — `models_train_comp_df`,
# a leftover frame from an earlier section, so the printed table showed stale
# numbers. Build, label, and display one frame consistently.
models_train_comp_df = pd.concat(
    [
        rf_model_train_perf.T,
        rf_over_train_perf.T,
        rf_reg_train_perf.T,
        rf_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Random Forest",
    "Random Forest with oversampled data",
    "Regularised Random Forest",
    "Random Forest with undersampled data",
]
# Keep the section-specific alias pointing at the same (now correct) frame.
models_train_comp_rf = models_train_comp_df
print("Training performance comparison:")
models_train_comp_df
# NOTE(review): the table below came from displaying a stale frame (the concat
# result above was stored under a different name) — the numbers do not match
# this section's per-model tables and should be regenerated.
Training performance comparison:
| Random Forest | Random Forest with oversampled data | Regularised Random Forest | Random Forest with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.942 | 0.925 | 0.762 | 0.890 |
| Recall | 0.828 | 0.853 | 0.908 | 0.899 |
| Precision | 0.813 | 0.730 | 0.395 | 0.608 |
| F1 | 0.821 | 0.786 | 0.551 | 0.725 |
# Validation performance comparison
models_train_comp_df = pd.concat(
    [rf_model_val_perf.T, rf_over_val_perf.T, rf_reg_val_perf.T, rf_under_val_perf.T,],
    axis=1,
)
models_train_comp_df.columns = [
    "Random Forest",
    "Random Forest with oversampled data",
    "Regularised Random Forest",
    "Random Forest with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| Random Forest | Random Forest with oversampled data | Regularised Random Forest | Random Forest with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.958 | 0.953 | 0.949 | 0.934 |
| Recall | 0.801 | 0.859 | 0.911 | 0.936 |
| Precision | 0.929 | 0.851 | 0.801 | 0.730 |
| F1 | 0.860 | 0.855 | 0.852 | 0.820 |
# Calculating different metrics on the held-out test set for the chosen
# (undersampled) Random Forest
rf_under_test_perf = model_performance_classification_sklearn(rf_under, X_test, y_test)
print("Test performance:")
rf_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.927 | 0.963 | 0.699 | 0.810 |
# creating confusion matrix
confusion_matrix_sklearn(rf_under, X_test, y_test)
# ---- Bagging classifier: baseline on the original (imbalanced) training data ----
bagging_classifier = BaggingClassifier(random_state=1)
bagging_classifier.fit(X_train, y_train)
BaggingClassifier(random_state=1)
# 5-fold stratified CV, scored on recall (same protocol as the RF section).
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=bagging_classifier, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
bagging_classifier_model_train_perf = model_performance_classification_sklearn(
    bagging_classifier, X_train, y_train
)
print("Training performance:")
bagging_classifier_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.997 | 0.986 | 0.997 | 0.991 |
# Calculating different metrics on validation set
bagging_classifier_model_val_perf = model_performance_classification_sklearn(
    bagging_classifier, X_val, y_val
)
print("Validation performance:")
bagging_classifier_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954 | 0.810 | 0.895 | 0.850 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_classifier, X_val, y_val)
# Re-running SMOTE here reproduces the same resampled data as before
# (identical parameters and random_state=1).
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
bagging_classifier_over = BaggingClassifier(random_state=1)
# Training the basic bagging_classifier_over model with training set
bagging_classifier_over.fit(X_train_over, y_train_over)
BaggingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=bagging_classifier_over,
    X=X_train_over,
    y=y_train_over,
    scoring=scoring,
    cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
bagging_classifier_over_train_perf = model_performance_classification_sklearn(
    bagging_classifier_over, X_train_over, y_train_over
)
print("Training performance:")
bagging_classifier_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.998 | 0.998 | 0.999 | 0.998 |
# Calculating different metrics on validation set
bagging_classifier_over_val_perf = model_performance_classification_sklearn(
    bagging_classifier_over, X_val, y_val
)
print("validation performance:")
bagging_classifier_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.945 | 0.865 | 0.806 | 0.834 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_classifier_over, X_val, y_val)
# ---- Bagging classifier: undersampled variant, then the "regularised" baseline ----
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
bagging_classifier_under = BaggingClassifier(random_state=1)
bagging_classifier_under.fit(X_train_un, y_train_un)
BaggingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=bagging_classifier_under,
    X=X_train_un,
    y=y_train_un,
    scoring=scoring,
    cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
bagging_classifier_under_train_perf = model_performance_classification_sklearn(
    bagging_classifier_under, X_train_un, y_train_un
)
print("Training performance:")
bagging_classifier_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.994 | 0.991 | 0.998 | 0.994 |
# Calculating different metrics on validation set
bagging_classifier_under_val_perf = model_performance_classification_sklearn(
    bagging_classifier_under, X_val, y_val
)
print("Validation performance:")
bagging_classifier_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.926 | 0.936 | 0.703 | 0.803 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_classifier_under, X_val, y_val)
# NOTE(review): as in the RF section, this stores into the "_reg_" variables
# but scores the untuned `bagging_classifier`; the tuned estimator defined
# below overwrites these variables — confirm this baseline is intentional.
# Calculating different metrics on train set
bagging_classifier_reg_train_perf = model_performance_classification_sklearn(
    bagging_classifier, X_train_over, y_train_over
)
print("Training performance:")
bagging_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.957 | 0.915 | 0.999 | 0.955 |
# Calculating different metrics on validation set
bagging_classifier_reg_val_perf = model_performance_classification_sklearn(
    bagging_classifier, X_val, y_val
)
print("Validation performance:")
bagging_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.954 | 0.810 | 0.895 | 0.850 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_classifier, X_val, y_val)
# ---- Bagging classifier: hyperparameter tuning on the oversampled data ----
# Choose the type of classifier.
bagging_classifier_estimator = BaggingClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
    "max_samples": [0.7, 0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
    "n_estimators": [10, 20, 30, 40, 50],
}
# Run the grid search
grid_obj = GridSearchCV(bagging_classifier_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
bagging_classifier_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
bagging_classifier_estimator.fit(X_train_over, y_train_over)
BaggingClassifier(max_features=0.7, max_samples=0.7, n_estimators=50,
                  random_state=1)
# Calculating different metrics on train set
bagging_classifier_reg_train_perf = model_performance_classification_sklearn(
    bagging_classifier_estimator, X_train_over, y_train_over
)
print("Training performance:")
bagging_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 0.999 | 1.000 |
# Calculating different metrics on validation set
bagging_classifier_reg_val_perf = model_performance_classification_sklearn(
    bagging_classifier_estimator, X_val, y_val
)
print("Validation performance:")
bagging_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.962 | 0.920 | 0.857 | 0.888 |
# Training performance comparison across the four Bagging variants.
# BUG FIX: the original assigned the concat result to
# `models_train_comp_bagging_classifier` but then relabelled and displayed the
# stale `models_train_comp_df` from the previous section, which is why the
# printed table repeated the Random Forest numbers. Build, label, and display
# one frame consistently.
models_train_comp_df = pd.concat(
    [
        bagging_classifier_model_train_perf.T,
        bagging_classifier_over_train_perf.T,
        bagging_classifier_reg_train_perf.T,
        bagging_classifier_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Bagging",
    "Bagging with oversampled data",
    "Regularised Bagging",
    "Bagging with undersampled data",
]
# Keep the section-specific alias pointing at the same (now correct) frame.
models_train_comp_bagging_classifier = models_train_comp_df
print("Training performance comparison:")
models_train_comp_df
# NOTE(review): the table below shows the Random Forest section's validation
# numbers, not Bagging training metrics — the concat result above was stored
# under a different name than the frame that was displayed; regenerate.
Training performance comparison:
| Bagging | Bagging with oversampled data | Regularised Bagging | Bagging with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.958 | 0.953 | 0.949 | 0.934 |
| Recall | 0.801 | 0.859 | 0.911 | 0.936 |
| Precision | 0.929 | 0.851 | 0.801 | 0.730 |
| F1 | 0.860 | 0.855 | 0.852 | 0.820 |
# Validation performance comparison
models_train_comp_df = pd.concat(
    [
        bagging_classifier_model_val_perf.T,
        bagging_classifier_over_val_perf.T,
        bagging_classifier_reg_val_perf.T,
        bagging_classifier_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Bagging",
    "Bagging with oversampled data",
    "Regularised Bagging",
    "Bagging with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| Bagging | Bagging with oversampled data | Regularised Bagging | Bagging with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.954 | 0.945 | 0.962 | 0.926 |
| Recall | 0.810 | 0.865 | 0.920 | 0.936 |
| Precision | 0.895 | 0.806 | 0.857 | 0.703 |
| F1 | 0.850 | 0.834 | 0.888 | 0.803 |
# Calculating different metrics on the held-out test set for the chosen
# (undersampled) Bagging classifier
bagging_classifier_under_test_perf = model_performance_classification_sklearn(
    bagging_classifier_under, X_test, y_test
)
print("Test performance:")
bagging_classifier_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.920 | 0.951 | 0.678 | 0.791 |
# creating confusion matrix
confusion_matrix_sklearn(bagging_classifier_under, X_test, y_test)
# ---- AdaBoost: baseline on the original (imbalanced) training data ----
ab_classifier = AdaBoostClassifier(random_state=1)
ab_classifier.fit(X_train, y_train)
AdaBoostClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=ab_classifier, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
ab_classifier_model_train_perf = model_performance_classification_sklearn(
    ab_classifier, X_train, y_train
)
print("Training performance:")
ab_classifier_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.957 | 0.827 | 0.900 | 0.862 |
# Calculating different metrics on validation set
ab_classifier_model_val_perf = model_performance_classification_sklearn(
    ab_classifier, X_val, y_val
)
print("Validation performance:")
ab_classifier_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.962 | 0.853 | 0.906 | 0.878 |
# creating confusion matrix
confusion_matrix_sklearn(ab_classifier, X_val, y_val)
# Same SMOTE resample as earlier sections (identical parameters and seed).
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
ab_classifier_over = AdaBoostClassifier(random_state=1)
# Training the ab_classifier_over model with training set
ab_classifier_over.fit(X_train_over, y_train_over)
AdaBoostClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=ab_classifier_over,
    X=X_train_over,
    y=y_train_over,
    scoring=scoring,
    cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
ab_classifier_over_train_perf = model_performance_classification_sklearn(
    ab_classifier_over, X_train_over, y_train_over
)
print("Training performance:")
ab_classifier_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.963 | 0.966 | 0.961 | 0.963 |
# Calculating different metrics on validation set
ab_classifier_over_val_perf = model_performance_classification_sklearn(
    ab_classifier_over, X_val, y_val
)
print("validation performance:")
ab_classifier_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.946 | 0.880 | 0.804 | 0.840 |
# creating confusion matrix
confusion_matrix_sklearn(ab_classifier_over, X_val, y_val)
# ---- AdaBoost: undersampled variant, then the "regularised" baseline ----
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
ab_classifier_under = AdaBoostClassifier(random_state=1)
ab_classifier_under.fit(X_train_un, y_train_un)
AdaBoostClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_under = cross_val_score(
    estimator=ab_classifier_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
ab_classifier_under_train_perf = model_performance_classification_sklearn(
    ab_classifier_under, X_train_un, y_train_un
)
print("Training performance:")
ab_classifier_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.950 | 0.953 | 0.947 | 0.950 |
# Calculating different metrics on validation set
ab_classifier_under_val_perf = model_performance_classification_sklearn(
    ab_classifier_under, X_val, y_val
)
print("Validation performance:")
ab_classifier_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.929 | 0.960 | 0.705 | 0.813 |
# creating confusion matrix
confusion_matrix_sklearn(ab_classifier_under, X_val, y_val)
# NOTE(review): stores into the "_reg_" variables but scores the untuned
# `ab_classifier`; the tuned estimator below overwrites these variables —
# confirm this baseline is intentional.
# Calculating different metrics on train set
ab_classifier_reg_train_perf = model_performance_classification_sklearn(
    ab_classifier, X_train_over, y_train_over
)
print("Training performance:")
ab_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.910 | 0.838 | 0.979 | 0.903 |
# Calculating different metrics on validation set
ab_classifier_reg_val_perf = model_performance_classification_sklearn(
    ab_classifier, X_val, y_val
)
print("Validation performance:")
ab_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.962 | 0.853 | 0.906 | 0.878 |
# creating confusion matrix
confusion_matrix_sklearn(ab_classifier, X_val, y_val)
# ---- AdaBoost: hyperparameter tuning on the oversampled data ----
# Choose the type of classifier.
ab_classifier_estimator = AdaBoostClassifier(random_state=1)
# Grid of parameters to choose from
parameters = {
    "n_estimators": np.arange(10, 100, 10),
    "learning_rate": [1, 0.1, 0.5, 0.01],
}
# Run the grid search
grid_obj = GridSearchCV(ab_classifier_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
ab_classifier_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
ab_classifier_estimator.fit(X_train_over, y_train_over)
AdaBoostClassifier(learning_rate=1, n_estimators=90, random_state=1)
# Calculating different metrics on train set
ab_classifier_reg_train_perf = model_performance_classification_sklearn(
    ab_classifier_estimator, X_train_over, y_train_over
)
print("Training performance:")
ab_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.970 | 0.972 | 0.969 | 0.970 |
# Calculating different metrics on validation set
ab_classifier_reg_val_perf = model_performance_classification_sklearn(
    ab_classifier_estimator, X_val, y_val
)
print("Validation performance:")
ab_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.950 | 0.887 | 0.816 | 0.850 |
# Training performance comparison across the four AdaBoost variants.
# BUG FIX: the original assigned the concat result to
# `models_train_comp_ab_classifier` but then relabelled and displayed the
# stale `models_train_comp_df` from the previous section, so the printed
# table repeated the Bagging numbers. Build, label, and display one frame
# consistently.
models_train_comp_df = pd.concat(
    [
        ab_classifier_model_train_perf.T,
        ab_classifier_over_train_perf.T,
        ab_classifier_reg_train_perf.T,
        ab_classifier_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "AdaBoost",
    "AdaBoost with oversampled data",
    "Regularised AdaBoost",
    "AdaBoost with undersampled data",
]
# Keep the section-specific alias pointing at the same (now correct) frame.
models_train_comp_ab_classifier = models_train_comp_df
print("Training performance comparison:")
models_train_comp_df
# NOTE(review): the table below shows the Bagging section's validation numbers,
# not AdaBoost training metrics — the concat result above was stored under a
# different name than the frame that was displayed; regenerate.
Training performance comparison:
| AdaBoost | AdaBoost with oversampled data | Regularised AdaBoost | AdaBoost with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.954 | 0.945 | 0.962 | 0.926 |
| Recall | 0.810 | 0.865 | 0.920 | 0.936 |
| Precision | 0.895 | 0.806 | 0.857 | 0.703 |
| F1 | 0.850 | 0.834 | 0.888 | 0.803 |
# Validation performance comparison
models_train_comp_df = pd.concat(
    [
        ab_classifier_model_val_perf.T,
        ab_classifier_over_val_perf.T,
        ab_classifier_reg_val_perf.T,
        ab_classifier_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "AdaBoost",
    "AdaBoost with oversampled data",
    "Regularised AdaBoost",
    "AdaBoost with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| AdaBoost | AdaBoost with oversampled data | Regularised AdaBoost | AdaBoost with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.962 | 0.946 | 0.950 | 0.929 |
| Recall | 0.853 | 0.880 | 0.887 | 0.960 |
| Precision | 0.906 | 0.804 | 0.816 | 0.705 |
| F1 | 0.878 | 0.840 | 0.850 | 0.813 |
# Calculating different metrics on the held-out test set for the chosen
# (undersampled) AdaBoost classifier
ab_classifier_under_test_perf = model_performance_classification_sklearn(
    ab_classifier_under, X_test, y_test
)
print("Test performance:")
ab_classifier_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.928 | 0.960 | 0.701 | 0.810 |
# creating confusion matrix
confusion_matrix_sklearn(ab_classifier_under, X_test, y_test)
# ---- Gradient Boosting: baseline on the original (imbalanced) training data ----
gb_classifier = GradientBoostingClassifier(random_state=1)
gb_classifier.fit(X_train, y_train)
GradientBoostingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
    estimator=gb_classifier, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
gb_classifier_model_train_perf = model_performance_classification_sklearn(
    gb_classifier, X_train, y_train
)
print("Training performance:")
gb_classifier_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.973 | 0.875 | 0.956 | 0.914 |
# Calculating different metrics on validation set
gb_classifier_model_val_perf = model_performance_classification_sklearn(
    gb_classifier, X_val, y_val
)
print("Validation performance:")
gb_classifier_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.969 | 0.859 | 0.946 | 0.900 |
# creating confusion matrix
confusion_matrix_sklearn(gb_classifier, X_val, y_val)
# Same SMOTE resample as earlier sections (identical parameters and seed).
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
gb_classifier_over = GradientBoostingClassifier(random_state=1)
# Training the gb_classifier_over model with training set
gb_classifier_over.fit(X_train_over, y_train_over)
GradientBoostingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
    n_splits=5, shuffle=True, random_state=1
)  # Setting number of splits equal to 5
cv_result_over = cross_val_score(
    estimator=gb_classifier_over,
    X=X_train_over,
    y=y_train_over,
    scoring=scoring,
    cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
gb_classifier_over_train_perf = model_performance_classification_sklearn(
    gb_classifier_over, X_train_over, y_train_over
)
print("Training performance:")
gb_classifier_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.979 | 0.981 | 0.977 | 0.979 |
# Calculating different metrics on validation set
gb_classifier_over_val_perf = model_performance_classification_sklearn(
    gb_classifier_over, X_val, y_val
)
print("validation performance:")
gb_classifier_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.957 | 0.890 | 0.850 | 0.870 |
# creating confusion matrix
confusion_matrix_sklearn(gb_classifier_over, X_val, y_val)
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
gb_classifier_under = GradientBoostingClassifier(random_state=1)
gb_classifier_under.fit(X_train_un, y_train_un)
GradientBoostingClassifier(random_state=1)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_under = cross_val_score(
estimator=gb_classifier_under, X=X_train_un, y=y_train_un, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
gb_classifier_under_train_perf = model_performance_classification_sklearn(
gb_classifier_under, X_train_un, y_train_un
)
print("Training performance:")
gb_classifier_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.974 | 0.980 | 0.969 | 0.974 |
# Calculating different metrics on validation set
gb_classifier_under_val_perf = model_performance_classification_sklearn(
gb_classifier_under, X_val, y_val
)
print("Validation performance:")
gb_classifier_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.937 | 0.960 | 0.733 | 0.831 |
# creating confusion matrix (under-sampled GB model, validation set)
confusion_matrix_sklearn(gb_classifier_under, X_val, y_val)
# Calculating different metrics on train set
# NOTE(review): this evaluates the baseline gb_classifier on the oversampled
# train set; the variable is overwritten after tuning below.
gb_classifier_reg_train_perf = model_performance_classification_sklearn(
    gb_classifier, X_train_over, y_train_over
)
print("Training performance:")
gb_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.922 | 0.851 | 0.991 | 0.916 |
# Calculating different metrics on the validation set (baseline gb_classifier)
gb_classifier_reg_val_perf = model_performance_classification_sklearn(
    gb_classifier, X_val, y_val
)
print("Validation performance:")
gb_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.969 | 0.859 | 0.946 | 0.900 |
# creating confusion matrix (baseline GB model, validation set)
confusion_matrix_sklearn(gb_classifier, X_val, y_val)
# Choose the type of classifier.
# Regularised Gradient Boosting: AdaBoost is used as the initial estimator.
gb_classifier_estimator = GradientBoostingClassifier(
    init=AdaBoostClassifier(random_state=1), random_state=1
)
# Grid of parameters to choose from
parameters = {
    "n_estimators": [100, 150, 200, 250],
    "subsample": [0.8, 0.9, 1],
    "max_features": [0.7, 0.8, 0.9, 1],
}
# Run the grid search, selecting on recall
grid_obj = GridSearchCV(gb_classifier_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
gb_classifier_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
gb_classifier_estimator.fit(X_train_over, y_train_over)
GradientBoostingClassifier(init=AdaBoostClassifier(random_state=1),
max_features=0.9, n_estimators=250, random_state=1,
subsample=1)
# Calculating different metrics on train set for the tuned (regularised) GB model
gb_classifier_reg_train_perf = model_performance_classification_sklearn(
    gb_classifier_estimator, X_train_over, y_train_over
)
print("Training performance:")
gb_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.990 | 0.992 | 0.988 | 0.990 |
# Calculating different metrics on the validation set for the tuned (regularised) GB model
gb_classifier_reg_val_perf = model_performance_classification_sklearn(
    gb_classifier_estimator, X_val, y_val
)
print("Validation performance:")
gb_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968 | 0.905 | 0.897 | 0.901 |
# training performance comparison of the four Gradient Boosting variants
# Fix: the concat result must be assigned to the same frame that is labelled
# and displayed below. Previously it went into models_train_comp_gb_classifier
# while a stale models_train_comp_df (from an earlier section) was relabelled
# and shown, so the displayed table did not contain these four models' metrics.
models_train_comp_df = pd.concat(
    [
        gb_classifier_model_train_perf.T,
        gb_classifier_over_train_perf.T,
        gb_classifier_reg_train_perf.T,
        gb_classifier_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Gradient Boosting",
    "Gradient Boosting with oversampled data",
    "Regularised Gradient Boosting",
    "Gradient Boosting with undersampled data",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| Gradient Boosting | Gradient Boosting with oversampled data | Regularised Gradient Boosting | Gradient Boosting with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.962 | 0.946 | 0.950 | 0.929 |
| Recall | 0.853 | 0.880 | 0.887 | 0.960 |
| Precision | 0.906 | 0.804 | 0.816 | 0.705 |
| F1 | 0.878 | 0.840 | 0.850 | 0.813 |
# Validation performance comparison of the four Gradient Boosting variants
# NOTE(review): the frame is named models_train_comp_df but holds the
# VALIDATION comparison; the name is reused/overwritten across sections.
models_train_comp_df = pd.concat(
    [
        gb_classifier_model_val_perf.T,
        gb_classifier_over_val_perf.T,
        gb_classifier_reg_val_perf.T,
        gb_classifier_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "Gradient Boosting",
    "Gradient Boosting with oversampled data",
    "Regularised Gradient Boosting",
    "Gradient Boosting with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| Gradient Boosting | Gradient Boosting with oversampled data | Regularised Gradient Boosting | Gradient Boosting with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.969 | 0.957 | 0.968 | 0.937 |
| Recall | 0.859 | 0.890 | 0.905 | 0.960 |
| Precision | 0.946 | 0.850 | 0.897 | 0.733 |
| F1 | 0.900 | 0.870 | 0.901 | 0.831 |
# Calculating different metrics on the TEST set (final evaluation of the
# under-sampled Gradient Boosting model)
gb_classifier_under_test_perf = model_performance_classification_sklearn(
    gb_classifier_under, X_test, y_test
)
print("Test performance:")
gb_classifier_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.933 | 0.969 | 0.716 | 0.824 |
# creating confusion matrix for the under-sampled GB model on the test set
confusion_matrix_sklearn(gb_classifier_under, X_test, y_test)
There are 125 False negatives in the gb_classifier under sampled set.
# Baseline XGBoost model on the original (imbalanced) training data
xgb_classifier = XGBClassifier(random_state=1, eval_metric="logloss")
xgb_classifier.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_bfr = cross_val_score(
estimator=xgb_classifier, X=X_train, y=y_train, scoring=scoring, cv=kfold
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_bfr)
plt.show()
# Calculating different metrics on train set
xgb_classifier_model_train_perf = model_performance_classification_sklearn(
xgb_classifier, X_train, y_train
)
print("Training performance:")
xgb_classifier_model_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on the validation set for the baseline XGBoost model
xgb_classifier_model_val_perf = model_performance_classification_sklearn(
    xgb_classifier, X_val, y_val
)
print("Validation performance:")
xgb_classifier_model_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.970 | 0.887 | 0.923 | 0.905 |
# creating confusion matrix (baseline XGBoost, validation set)
confusion_matrix_sklearn(xgb_classifier, X_val, y_val)
# Oversample the minority class with SMOTE to a 1:1 ratio
print("Before UpSampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before UpSampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
sm = SMOTE(
    sampling_strategy=1, k_neighbors=5, random_state=1
)  # Synthetic Minority Over Sampling Technique
X_train_over, y_train_over = sm.fit_resample(X_train, y_train)
print("After UpSampling, counts of label 'Yes': {}".format(sum(y_train_over == 1)))
print("After UpSampling, counts of label 'No': {} \n".format(sum(y_train_over == 0)))
print("After UpSampling, the shape of train_X: {}".format(X_train_over.shape))
print("After UpSampling, the shape of train_y: {} \n".format(y_train_over.shape))
Before UpSampling, counts of label 'Yes': 976 Before UpSampling, counts of label 'No': 5099 After UpSampling, counts of label 'Yes': 5099 After UpSampling, counts of label 'No': 5099 After UpSampling, the shape of train_X: (10198, 28) After UpSampling, the shape of train_y: (10198,)
# XGBoost model trained on the SMOTE-oversampled training data
xgb_classifier_over = XGBClassifier(random_state=1, eval_metric="logloss")
# Training the xgb_classifier_over model with training set
xgb_classifier_over.fit(X_train_over, y_train_over)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_over = cross_val_score(
estimator=xgb_classifier_over,
X=X_train_over,
y=y_train_over,
scoring=scoring,
cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_over)
plt.show()
# Calculating different metrics on train set
xgb_classifier_over_train_perf = model_performance_classification_sklearn(
xgb_classifier_over, X_train_over, y_train_over
)
print("Training performance:")
xgb_classifier_over_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on the validation set for the oversampled XGBoost model
xgb_classifier_over_val_perf = model_performance_classification_sklearn(
    xgb_classifier_over, X_val, y_val
)
print("validation performance:")
xgb_classifier_over_val_perf
validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.968 | 0.902 | 0.899 | 0.900 |
# creating confusion matrix (oversampled XGBoost, validation set)
confusion_matrix_sklearn(xgb_classifier_over, X_val, y_val)
# Randomly under-sample the majority class to a 1:1 ratio
rus = RandomUnderSampler(random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)
print("Before Under Sampling, counts of label 'Yes': {}".format(sum(y_train == 1)))
print("Before Under Sampling, counts of label 'No': {} \n".format(sum(y_train == 0)))
print("After Under Sampling, counts of label 'Yes': {}".format(sum(y_train_un == 1)))
print("After Under Sampling, counts of label 'No': {} \n".format(sum(y_train_un == 0)))
print("After Under Sampling, the shape of train_X: {}".format(X_train_un.shape))
print("After Under Sampling, the shape of train_y: {} \n".format(y_train_un.shape))
Before Under Sampling, counts of label 'Yes': 976 Before Under Sampling, counts of label 'No': 5099 After Under Sampling, counts of label 'Yes': 976 After Under Sampling, counts of label 'No': 976 After Under Sampling, the shape of train_X: (1952, 28) After Under Sampling, the shape of train_y: (1952,)
# XGBoost model trained on the under-sampled training data
xgb_classifier_under = XGBClassifier(random_state=1, eval_metric="logloss")
xgb_classifier_under.fit(X_train_un, y_train_un)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.300000012,
max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=1, subsample=1, tree_method='exact',
validate_parameters=1, verbosity=None)
scoring = "recall"
kfold = StratifiedKFold(
n_splits=5, shuffle=True, random_state=1
) # Setting number of splits equal to 5
cv_result_under = cross_val_score(
estimator=xgb_classifier_under,
X=X_train_un,
y=y_train_un,
scoring=scoring,
cv=kfold,
)
# Plotting boxplots for CV scores of model defined above
plt.boxplot(cv_result_under)
plt.show()
# Calculating different metrics on train set
xgb_classifier_under_train_perf = model_performance_classification_sklearn(
xgb_classifier_under, X_train_un, y_train_un
)
print("Training performance:")
xgb_classifier_under_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 1.000 | 1.000 | 1.000 | 1.000 |
# Calculating different metrics on the validation set for the under-sampled XGBoost model
xgb_classifier_under_val_perf = model_performance_classification_sklearn(
    xgb_classifier_under, X_val, y_val
)
print("Validation performance:")
xgb_classifier_under_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.941 | 0.960 | 0.747 | 0.840 |
# creating confusion matrix (under-sampled XGBoost, validation set)
confusion_matrix_sklearn(xgb_classifier_under, X_val, y_val)
# Calculating different metrics on train set
# NOTE(review): this evaluates the baseline xgb_classifier on the oversampled
# train set; the variable is overwritten after tuning below.
xgb_classifier_reg_train_perf = model_performance_classification_sklearn(
    xgb_classifier, X_train_over, y_train_over
)
print("Training performance:")
xgb_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.980 | 0.960 | 1.000 | 0.980 |
# Calculating different metrics on the validation set (baseline xgb_classifier)
xgb_classifier_reg_val_perf = model_performance_classification_sklearn(
    xgb_classifier, X_val, y_val
)
print("Validation performance:")
xgb_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.970 | 0.887 | 0.923 | 0.905 |
# creating confusion matrix (baseline XGBoost, validation set)
confusion_matrix_sklearn(xgb_classifier, X_val, y_val)
# Choose the type of classifier.
xgb_classifier_estimator = XGBClassifier(random_state=1, eval_metric="logloss")
# Grid of parameters to choose from
parameters = {
    "n_estimators": [10, 30, 50],
    "scale_pos_weight": [1, 2, 5],
    "subsample": [0.7, 0.9, 1],
    "learning_rate": [0.05, 0.1, 0.2],
    "colsample_bytree": [0.7, 0.9, 1],
    "colsample_bylevel": [0.5, 0.7, 1],
}
# Run the grid search over the freshly defined estimator.
# Fix: the search was previously run over the already-fitted baseline
# `xgb_classifier`, making the `xgb_classifier_estimator` defined above unused.
grid_obj = GridSearchCV(xgb_classifier_estimator, parameters, scoring="recall")
grid_obj = grid_obj.fit(X_train_over, y_train_over)
# Set the clf to the best combination of parameters
xgb_classifier_estimator = grid_obj.best_estimator_
# Fit the best algorithm to the data.
xgb_classifier_estimator.fit(X_train_over, y_train_over)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=0.5,
colsample_bynode=1, colsample_bytree=0.7, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.05, max_delta_step=0,
max_depth=6, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=10, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=1,
scale_pos_weight=5, subsample=0.7, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set for the tuned (regularised) XGBoost model
xgb_classifier_reg_train_perf = model_performance_classification_sklearn(
    xgb_classifier_estimator, X_train_over, y_train_over
)
print("Training performance:")
xgb_classifier_reg_train_perf
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.893 | 0.999 | 0.824 | 0.903 |
# Calculating different metrics on the validation set for the tuned (regularised) XGBoost model
xgb_classifier_reg_val_perf = model_performance_classification_sklearn(
    xgb_classifier_estimator, X_val, y_val
)
print("Validation performance:")
xgb_classifier_reg_val_perf
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.812 | 0.975 | 0.460 | 0.625 |
# training performance comparison of the four XGBoost variants
# Fix: the concat result must be assigned to the same frame that is labelled
# and displayed below. Previously it went into models_train_comp_xgb_classifier
# while the stale models_train_comp_df from the GB section was relabelled and
# shown — which is why the displayed "XGBoost" table repeated the GB numbers.
models_train_comp_df = pd.concat(
    [
        xgb_classifier_model_train_perf.T,
        xgb_classifier_over_train_perf.T,
        xgb_classifier_reg_train_perf.T,
        xgb_classifier_under_train_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "XGBoost",
    "XGBoost with oversampled data",
    "Regularised XGBoost",
    "XGBoost with undersampled data",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| XGBoost | XGBoost with oversampled data | Regularised XGBoost | XGBoost with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.969 | 0.957 | 0.968 | 0.937 |
| Recall | 0.859 | 0.890 | 0.905 | 0.960 |
| Precision | 0.946 | 0.850 | 0.897 | 0.733 |
| F1 | 0.900 | 0.870 | 0.901 | 0.831 |
# Validation performance comparison of the four XGBoost variants
# NOTE(review): the frame is named models_train_comp_df but holds the
# VALIDATION comparison; the name is reused/overwritten across sections.
models_train_comp_df = pd.concat(
    [
        xgb_classifier_model_val_perf.T,
        xgb_classifier_over_val_perf.T,
        xgb_classifier_reg_val_perf.T,
        xgb_classifier_under_val_perf.T,
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "XGBoost",
    "XGBoost with oversampled data",
    "Regularised XGBoost",
    "XGBoost with undersampled data",
]
print("Validation performance comparison:")
models_train_comp_df
Validation performance comparison:
| XGBoost | XGBoost with oversampled data | Regularised XGBoost | XGBoost with undersampled data | |
|---|---|---|---|---|
| Accuracy | 0.970 | 0.968 | 0.812 | 0.941 |
| Recall | 0.887 | 0.902 | 0.975 | 0.960 |
| Precision | 0.923 | 0.899 | 0.460 | 0.747 |
| F1 | 0.905 | 0.900 | 0.625 | 0.840 |
# Calculating different metrics on the TEST set (final evaluation of the
# under-sampled XGBoost model)
xgb_classifier_under_test_perf = model_performance_classification_sklearn(
    xgb_classifier_under, X_test, y_test
)
print("Test performance:")
xgb_classifier_under_test_perf
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.936 | 0.960 | 0.727 | 0.828 |
# creating confusion matrix for the under-sampled XGBoost model on the test set
confusion_matrix_sklearn(xgb_classifier_under, X_test, y_test)
# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Compute Accuracy, Recall, Precision and F1 for a fitted sklearn classifier.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels

    Returns a single-row DataFrame holding the four metric values.
    """
    # predictions on the given data
    predictions = model.predict(predictors)
    # assemble the four metric values; dict order fixes the column order
    metric_values = {
        "Accuracy": accuracy_score(target, predictions),
        "Recall": recall_score(target, predictions),
        "Precision": precision_score(target, predictions),
        "F1": f1_score(target, predictions),
    }
    return pd.DataFrame(metric_values, index=[0])
def confusion_matrix_sklearn(model, predictors, target):
    """
    Plot the confusion matrix of a fitted classifier, annotating each cell
    with the raw count and its percentage of all samples.

    model: fitted classifier exposing .predict
    predictors: independent variables
    target: true labels
    """
    predictions = model.predict(predictors)
    cm = confusion_matrix(target, predictions)
    total = cm.flatten().sum()
    # build one "count\npercent" annotation string per cell (binary case: 2x2)
    annotations = np.asarray(
        [
            "{0:0.0f}".format(count) + "\n{0:.2%}".format(count / total)
            for count in cm.flatten()
        ]
    ).reshape(2, 2)
    plt.figure(figsize=(6, 4))
    sns.heatmap(cm, annot=annotations, fmt="")
    plt.ylabel("True label")
    plt.xlabel("Predicted label")
%%time
# defining model: AdaBoost with tree stumps/shallow trees as base estimators
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV (n_jobs=-1 uses all CPU cores)
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1), 'learning_rate': 1, 'n_estimators': 80}
Score: 0.8596127681841969
CPU times: user 3.64 s, sys: 722 ms, total: 4.36 s
Wall time: 1min 14s
# building model with best parameters found by GridSearchCV
adb_tuned1 = AdaBoostClassifier(
    n_estimators=80,
    learning_rate=1,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
)
# Fit the model on training data
adb_tuned1.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=80, random_state=1)
# Calculating different metrics on train set for the grid-tuned AdaBoost model
Adaboost_grid_train = model_performance_classification_sklearn(
    adb_tuned1, X_train, y_train
)
print("Training performance:")
Adaboost_grid_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.993 | 0.973 | 0.980 | 0.977 |
# Calculating different metrics on the validation set for the grid-tuned AdaBoost model
Adaboost_grid_val = model_performance_classification_sklearn(adb_tuned1, X_val, y_val)
print("Validation performance:")
Adaboost_grid_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.965 | 0.865 | 0.913 | 0.888 |
# creating confusion matrix (grid-tuned AdaBoost, validation set)
confusion_matrix_sklearn(adb_tuned1, X_val, y_val)
%%time
# defining model
model = AdaBoostClassifier(random_state=1)
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    "n_estimators": np.arange(10, 110, 10),
    "learning_rate": [0.1, 0.01, 0.2, 0.05, 1],
    "base_estimator": [
        DecisionTreeClassifier(max_depth=1, random_state=1),
        DecisionTreeClassifier(max_depth=2, random_state=1),
        DecisionTreeClassifier(max_depth=3, random_state=1),
    ],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV (samples 50 random parameter combinations)
randomized_cv = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_jobs = -1, n_iter=50, scoring=scorer, cv=5, random_state=1)
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(randomized_cv.best_params_,randomized_cv.best_score_))
Best parameters are {'n_estimators': 90, 'learning_rate': 1, 'base_estimator': DecisionTreeClassifier(max_depth=2, random_state=1)} with CV score=0.8544897959183674:
CPU times: user 1.83 s, sys: 163 ms, total: 2 s
Wall time: 26.7 s
# building model with best parameters found by RandomizedSearchCV
adb_tuned2 = AdaBoostClassifier(
    n_estimators=90,
    learning_rate=1,
    random_state=1,
    base_estimator=DecisionTreeClassifier(max_depth=2, random_state=1),
)
# Fit the model on training data
adb_tuned2.fit(X_train, y_train)
AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
random_state=1),
learning_rate=1, n_estimators=90, random_state=1)
# Calculating different metrics on train set for the randomized-search AdaBoost model
Adaboost_random_train = model_performance_classification_sklearn(
    adb_tuned2, X_train, y_train
)
print("Training performance:")
Adaboost_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.996 | 0.986 | 0.991 | 0.988 |
# Calculating different metrics on the validation set for the randomized-search AdaBoost model
Adaboost_random_val = model_performance_classification_sklearn(adb_tuned2, X_val, y_val)
print("Validation performance:")
Adaboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.966 | 0.871 | 0.916 | 0.893 |
# creating confusion matrix (randomized-search AdaBoost, validation set)
confusion_matrix_sklearn(adb_tuned2, X_val, y_val)
# Decision tree model (no actual Pipeline object is built here)
model = DecisionTreeClassifier(random_state=1)
# Parameter grid to pass in GridSearchCV
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, None],
    "min_samples_split": [2, 4, 7, 10, 15],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train, y_train)
print(
    "Best Parameters:{} \nScore: {}".format(grid_cv.best_params_, grid_cv.best_score_)
)
Best Parameters:{'criterion': 'entropy', 'max_depth': None, 'min_samples_split': 4}
Score: 0.791978021978022
# Decision tree rebuilt with the best parameters found by GridSearchCV
dtree_tuned1 = DecisionTreeClassifier(
    random_state=1, criterion="entropy", max_depth=None, min_samples_split=4
)
# Fit the model on training data
dtree_tuned1.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', min_samples_split=4, random_state=1)
# Calculating different metrics on train set for the grid-tuned decision tree
dtree_grid_train = model_performance_classification_sklearn(
    dtree_tuned1, X_train, y_train
)
print("Training performance:")
dtree_grid_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.998 | 0.991 | 0.996 | 0.993 |
# Calculating different metrics on the validation set for the grid-tuned decision tree
dtree_grid_val = model_performance_classification_sklearn(dtree_tuned1, X_val, y_val)
print("Validation performance:")
dtree_grid_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.940 | 0.807 | 0.817 | 0.812 |
# creating confusion matrix (grid-tuned decision tree, validation set)
confusion_matrix_sklearn(dtree_tuned1, X_val, y_val)
# Decision tree model (no actual Pipeline object is built here)
model = DecisionTreeClassifier(random_state=1)
# Parameter grid to pass in RandomizedSearchCV
param_grid = {
    "criterion": ["gini", "entropy"],
    "max_depth": [3, 4, 5, None],
    "min_samples_split": [2, 4, 7, 10, 15],
}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling RandomizedSearchCV (samples 20 random parameter combinations)
randomized_cv = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_grid,
    n_iter=20,
    scoring=scorer,
    cv=5,
    random_state=1,
)
# Fitting parameters in RandomizedSearchCV
randomized_cv.fit(X_train, y_train)
print(
    "Best parameters are {} with CV score={}:".format(
        randomized_cv.best_params_, randomized_cv.best_score_
    )
)
Best parameters are {'min_samples_split': 4, 'max_depth': None, 'criterion': 'entropy'} with CV score=0.791978021978022:
# Decision tree rebuilt with the best parameters found by RandomizedSearchCV
# (identical to dtree_tuned1's parameters, so the two models coincide)
dtree_tuned2 = DecisionTreeClassifier(
    random_state=1, criterion="entropy", max_depth=None, min_samples_split=4
)
# Fit the model on training data
dtree_tuned2.fit(X_train, y_train)
DecisionTreeClassifier(criterion='entropy', min_samples_split=4, random_state=1)
# Calculating different metrics on train set for the randomized-search decision tree
dtree_random_train = model_performance_classification_sklearn(
    dtree_tuned2, X_train, y_train
)
print("Training performance:")
dtree_random_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.998 | 0.991 | 0.996 | 0.993 |
# Calculating different metrics on the validation set for the randomized-search decision tree
dtree_random_val = model_performance_classification_sklearn(dtree_tuned2, X_val, y_val)
print("Validation performance:")
dtree_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.940 | 0.807 | 0.817 | 0.812 |
# creating confusion matrix
# Fix: this section evaluates dtree_tuned2 (randomized search), but the matrix
# was plotted for dtree_tuned1. The two happen to share identical parameters
# here, yet the plot should reference the model under evaluation.
confusion_matrix_sklearn(dtree_tuned2, X_val, y_val)
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in GridSearchCV (2 x 3 x 4 x 4 x 3 x 4 x 2 = 2304
# combinations x 5 folds = 11520 fits -> long runtime, see timing output below)
param_grid={'n_estimators':np.arange(50,150,50),
    'scale_pos_weight':[2,5,10],
    'learning_rate':[0.01,0.1,0.2,0.05],
    'gamma':[0,1,3,5],
    'subsample':[0.8,0.9,1],
    'max_depth':np.arange(1,5,1),
    'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
# Calling GridSearchCV
grid_cv = GridSearchCV(estimator=model, param_grid=param_grid, scoring=scorer, cv=5, n_jobs = -1, verbose= 2)
# Fitting parameters in GridSearchCV
grid_cv.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(grid_cv.best_params_,grid_cv.best_score_))
Fitting 5 folds for each of 2304 candidates, totalling 11520 fits
Best parameters are {'gamma': 5, 'learning_rate': 0.1, 'max_depth': 2, 'n_estimators': 100, 'reg_lambda': 10, 'scale_pos_weight': 10, 'subsample': 0.9} with CV score=0.9712925170068027:
CPU times: user 28.6 s, sys: 5.18 s, total: 33.8 s
Wall time: 39min 12s
# building model with best parameters found by GridSearchCV
xgb_tuned1 = XGBClassifier(
    random_state=1,
    n_estimators=100,
    scale_pos_weight=10,
    subsample=0.9,
    learning_rate=0.1,
    gamma=5,
    eval_metric="logloss",
    reg_lambda=10,
    max_depth=2,
)
# Fit the model on training data
xgb_tuned1.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.9, tree_method='exact',
validate_parameters=1, verbosity=None)
# Calculating different metrics on train set for the grid-tuned XGBoost model
xgboost_grid_train = model_performance_classification_sklearn(
    xgb_tuned1, X_train, y_train
)
print("Training performance:")
xgboost_grid_train
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.901 | 0.986 | 0.620 | 0.761 |
# Calculating different metrics on the validation set for the grid-tuned XGBoost model
xgboost_grid_val = model_performance_classification_sklearn(xgb_tuned1, X_val, y_val)
print("Validation performance:")
xgboost_grid_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.902 | 0.969 | 0.626 | 0.761 |
# creating confusion matrix (grid-tuned XGBoost, validation set)
confusion_matrix_sklearn(xgb_tuned1, X_val, y_val)
%%time
# defining model
model = XGBClassifier(random_state=1,eval_metric='logloss')
# Parameter grid to pass in RandomizedSearchCV
param_grid={'n_estimators':np.arange(50,150,50),
'scale_pos_weight':[2,5,10],
'learning_rate':[0.01,0.1,0.2,0.05],
'gamma':[0,1,3,5],
'subsample':[0.8,0.9,1],
'max_depth':np.arange(1,5,1),
'reg_lambda':[5,10]}
# Type of scoring used to compare parameter combinations
scorer = metrics.make_scorer(metrics.recall_score)
#Calling RandomizedSearchCV
xgb_tuned2 = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=50, scoring=scorer, cv=5, random_state=1, n_jobs = -1)
#Fitting parameters in RandomizedSearchCV
xgb_tuned2.fit(X_train,y_train)
print("Best parameters are {} with CV score={}:" .format(xgb_tuned2.best_params_,xgb_tuned2.best_score_))
Best parameters are {'subsample': 0.8, 'scale_pos_weight': 10, 'reg_lambda': 10, 'n_estimators': 50, 'max_depth': 2, 'learning_rate': 0.1, 'gamma': 0} with CV score=0.9682260596546313:
CPU times: user 1.34 s, sys: 215 ms, total: 1.55 s
Wall time: 50.3 s
# Rebuild the XGBoost classifier with the best parameters found by the randomized search
xgb_tuned2 = XGBClassifier(
    random_state=1,
    eval_metric="logloss",
    n_estimators=50,
    max_depth=2,
    learning_rate=0.1,
    gamma=0,
    subsample=0.8,
    scale_pos_weight=10,
    reg_lambda=10,
)
# Fit on the full training split
xgb_tuned2.fit(X_train, y_train)
XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
colsample_bynode=1, colsample_bytree=1, eval_metric='logloss',
gamma=0, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1, max_delta_step=0,
max_depth=2, min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=50, n_jobs=8,
num_parallel_tree=1, random_state=1, reg_alpha=0, reg_lambda=10,
scale_pos_weight=10, subsample=0.8, tree_method='exact',
validate_parameters=1, verbosity=None)
# Score the randomized-search-tuned XGBoost model on the training set
xgboost_random_train = model_performance_classification_sklearn(xgb_tuned2, X_train, y_train)
print("Training performance:")
xgboost_random_train  # metrics table rendered as the cell output
Training performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.849 | 0.984 | 0.516 | 0.677 |
# Calculating different metrics on the validation set for the random-search-tuned XGBoost
xgboost_random_val = model_performance_classification_sklearn(xgb_tuned2, X_val, y_val)
print("Validation performance:")
xgboost_random_val
Validation performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.846 | 0.966 | 0.512 | 0.670 |
# Plot the confusion matrix for the random-search-tuned XGBoost on the validation set
confusion_matrix_sklearn(xgb_tuned2, X_val, y_val)
# Side-by-side training-set comparison of all six tuned models
models_train_comp_df = pd.concat(
    [
        frame.T
        for frame in (
            Adaboost_grid_train,
            Adaboost_random_train,
            dtree_grid_train,
            dtree_random_train,
            xgboost_grid_train,
            xgboost_random_train,
        )
    ],
    axis=1,
)
models_train_comp_df.columns = [
    "AdaBoost Tuned with Grid search",
    "AdaBoost Tuned with Random search",
    "Decision Tree Tuned with Grid search",
    "Decision Tree Tuned with Random search",
    "Xgboost Tuned with Grid search",
    "Xgboost Tuned with Random Search",
]
print("Training performance comparison:")
models_train_comp_df
Training performance comparison:
| AdaBoost Tuned with Grid search | AdaBoost Tuned with Random search | Decision Tree Tuned with Grid search | Decision Tree Tuned with Random search | Xgboost Tuned with Grid search | Xgboost Tuned with Random Search | |
|---|---|---|---|---|---|---|
| Accuracy | 0.993 | 0.996 | 0.998 | 0.998 | 0.901 | 0.849 |
| Recall | 0.973 | 0.986 | 0.991 | 0.991 | 0.986 | 0.984 |
| Precision | 0.980 | 0.991 | 0.996 | 0.996 | 0.620 | 0.516 |
| F1 | 0.977 | 0.988 | 0.993 | 0.993 | 0.761 | 0.677 |
# Side-by-side validation-set comparison of all six tuned models.
# BUG FIX: the original concatenated dtree_grid_train / dtree_random_train here,
# so the decision-tree columns of this VALIDATION table actually showed TRAINING
# metrics (the printed table repeated 0.998/0.991/0.996/0.993 from the training
# comparison). Use the validation-set results instead.
models_val_comp_df = pd.concat(
    [
        Adaboost_grid_val.T,
        Adaboost_random_val.T,
        dtree_grid_val.T,
        dtree_random_val.T,
        xgboost_grid_val.T,
        xgboost_random_val.T,
    ],
    axis=1,
)
models_val_comp_df.columns = [
    "AdaBoost Tuned with Grid search",
    "AdaBoost Tuned with Random search",
    "Decision Tree Tuned with Grid search",
    "Decision Tree Tuned with Random search",
    "Xgboost Tuned with Grid search",
    "Xgboost Tuned with Random Search",
]
print("Validation performance comparison:")
models_val_comp_df
Validation performance comparison:
| AdaBoost Tuned with Grid search | AdaBoost Tuned with Random search | Decision Tree Tuned with Grid search | Decision Tree Tuned with Random search | Xgboost Tuned with Grid search | Xgboost Tuned with Random Search | |
|---|---|---|---|---|---|---|
| Accuracy | 0.965 | 0.966 | 0.998 | 0.998 | 0.902 | 0.846 |
| Recall | 0.865 | 0.871 | 0.991 | 0.991 | 0.969 | 0.966 |
| Precision | 0.913 | 0.916 | 0.996 | 0.996 | 0.626 | 0.512 |
| F1 | 0.888 | 0.893 | 0.993 | 0.993 | 0.761 | 0.670 |
# Calculating different metrics on the test set for the tuned decision tree
dtree_grid_test = model_performance_classification_sklearn(dtree_tuned1, X_test, y_test)
print("Test performance:")
dtree_grid_test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.933 | 0.822 | 0.776 | 0.798 |
# Calculating different metrics on the test set for the grid-search-tuned XGBoost
xgboost_grid_test = model_performance_classification_sklearn(xgb_tuned1, X_test, y_test)
print("Test performance:")
xgboost_grid_test
Test performance:
| Accuracy | Recall | Precision | F1 | |
|---|---|---|---|---|
| 0 | 0.893 | 0.978 | 0.602 | 0.746 |
# Horizontal bar chart of feature importances for the grid-search-tuned XGBoost,
# sorted ascending so the most important feature ends up on top
importances = xgb_tuned1.feature_importances_
feature_names = X_train.columns
indices = np.argsort(importances)
ypos = range(len(indices))

plt.figure(figsize=(12, 12))
plt.title("Feature Importances")
plt.barh(ypos, importances[indices], color="violet", align="center")
plt.yticks(ypos, [feature_names[i] for i in indices])
plt.xlabel("Relative Importance")
plt.show()
# ---- Preprocessing pipeline for the raw (unimputed, unencoded) data ----

# Numeric columns: missing values will be imputed with the column median
numerical_features = [
    "Customer_Age",
    "Dependent_count",
    "Months_on_book",
    "Total_Relationship_Count",
    "Months_Inactive_12_mon",
    "Contacts_Count_12_mon",
    "Credit_Limit",
    "Total_Revolving_Bal",
    "Avg_Open_To_Buy",
    "Total_Amt_Chng_Q4_Q1",
    "Total_Trans_Amt",
    "Total_Trans_Ct",
    "Total_Ct_Chng_Q4_Q1",
    "Avg_Utilization_Ratio",
]
# Transformer for numerical variables: median imputation only (no scaling —
# tree-based models downstream do not need it)
numeric_transformer = Pipeline(steps=[("imputer", SimpleImputer(strategy="median"))])

# Categorical columns: imputed with the most frequent value, then one-hot encoded
categorical_features = [
    "Gender",
    "Education_Level",
    "Marital_Status",
    "Income_Category",
    "Card_Category",
]
# handle_unknown="ignore" makes the encoder emit all-zeros for categories
# never seen during fit, instead of raising at predict time
categorical_transformer = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Combine both transformers; remainder="passthrough" forwards any column not
# listed in numerical_features / categorical_features unchanged
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numerical_features),
        ("cat", categorical_transformer, categorical_features),
    ],
    remainder="passthrough",
)
# Separate the target from the predictors, then make a stratified 70/30
# train-test split (stratify keeps the attrition ratio equal in both splits)
Y = data["Attrition_Flag"]
X = data.drop("Attrition_Flag", axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.30, random_state=1, stratify=Y
)
print(X_train.shape, X_test.shape)
(7088, 20) (3039, 20)
# Preview the first five rows of the dataframe fed into the pipeline
df3.head()
| Customer_Age | Gender | Dependent_count | Education_Level | Marital_Status | Income_Category | Card_Category | Months_on_book | Total_Relationship_Count | Months_Inactive_12_mon | Contacts_Count_12_mon | Credit_Limit | Total_Revolving_Bal | Avg_Open_To_Buy | Total_Amt_Chng_Q4_Q1 | Total_Trans_Amt | Total_Trans_Ct | Total_Ct_Chng_Q4_Q1 | Avg_Utilization_Ratio | Existing_Customer | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 45 | M | 3 | High School | Married | $60K - $80K | Blue | 39 | 5 | 1 | 3 | 12691.000 | 777 | 11914.000 | 1.335 | 1144 | 42 | 1.625 | 0.061 | 1 |
| 1 | 49 | F | 5 | Graduate | Single | Less than $40K | Blue | 44 | 6 | 1 | 2 | 8256.000 | 864 | 7392.000 | 1.541 | 1291 | 33 | 3.714 | 0.105 | 1 |
| 2 | 51 | M | 3 | Graduate | Married | $80K - $120K | Blue | 36 | 4 | 1 | 0 | 3418.000 | 0 | 3418.000 | 2.594 | 1887 | 20 | 2.333 | 0.000 | 1 |
| 3 | 40 | F | 4 | High School | NaN | Less than $40K | Blue | 34 | 3 | 4 | 1 | 3313.000 | 2517 | 796.000 | 1.405 | 1171 | 20 | 2.333 | 0.760 | 1 |
| 4 | 40 | M | 3 | Uneducated | Married | $60K - $80K | Blue | 21 | 5 | 1 | 0 | 4716.000 | 0 | 4716.000 | 2.175 | 816 | 28 | 2.500 | 0.000 | 1 |
# Final end-to-end pipeline: preprocessing followed by the XGBoost classifier
# with the best hyperparameters found by the grid search
model = Pipeline(
    steps=[
        ("pre", preprocessor),
        (
            "XGB",
            XGBClassifier(
                random_state=1,
                eval_metric="logloss",
                n_estimators=100,
                max_depth=2,
                learning_rate=0.1,
                gamma=5,
                subsample=0.9,
                scale_pos_weight=10,
                reg_lambda=10,
            ),
        ),
    ]
)
# Fit on the raw training data; imputation/encoding happen inside the pipeline
model.fit(X_train, y_train)
Pipeline(steps=[('pre',
ColumnTransformer(remainder='passthrough',
transformers=[('num',
Pipeline(steps=[('imputer',
SimpleImputer(strategy='median'))]),
['Customer_Age',
'Dependent_count',
'Months_on_book',
'Total_Relationship_Count',
'Months_Inactive_12_mon',
'Contacts_Count_12_mon',
'Credit_Limit',
'Total_Revolving_Bal',
'Avg_Open_To_Buy',
'Total_Amt_Chng_Q4_...
gamma=5, gpu_id=-1, importance_type='gain',
interaction_constraints='', learning_rate=0.1,
max_delta_step=0, max_depth=2,
min_child_weight=1, missing=nan,
monotone_constraints='()', n_estimators=100,
n_jobs=8, num_parallel_tree=1, random_state=1,
reg_alpha=0, reg_lambda=10, scale_pos_weight=10,
subsample=0.9, tree_method='exact',
validate_parameters=1, verbosity=None))])